How many responses per contest?

This notebook analyzes the number of responses and captions per contest, both in aggregate (treating all contests equally) and over time.

[1]:
import os
import sys
from pathlib import Path
# Append the directory two levels above the current working directory to the
# import path (presumably the repository root, so the local package imports).
p = Path.cwd()
root = p.parent.parent
sys.path.append(os.fspath(root))
[2]:
import caption_contest_data as ccd
[3]:
# Collect the identifiers reported by `ccd.summary_ids()` and print them.
# NOTE: the printed set mixes integer IDs (e.g. 520) with CSV filenames
# (e.g. '559_summary_KLUCB.csv'), and some contest numbers appear under
# several filenames, so entries are not one-per-contest.
contests = ccd.summary_ids()
print(contests)
{'509-round1_summary_LilUCB.csv', 520, 521, 522, 523, 524, '559_summary_KLUCB.csv', 526, 527, 528, 529, 530, '511_summary_LilUCB.csv', '517_summary_LilUCB.csv', 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 552, 553, 554, 555, 557, 558, 551, 556, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, '514_summary_RoundRobin.csv', 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, '512_summary_RoundRobin.csv', 634, 635, 636, '513_summary_RoundRobin.csv', 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, '508-round1_summary_RoundRobin.csv', 665, 666, 667, 668, '508-round1_summary_LilUCB.csv', '515_summary_RoundRobin.csv', 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, '517_summary_RoundRobin.csv', '513_summary_LilUCB.csv', '559_summary_LilUCB.csv', '560_summary_KLUCB_funny.csv', '508-round2_summary_RoundRobin.csv', '514_summary_LilUCB.csv', '509-round2_summary_RoundRobin.csv', '515_summary_LilUCB.csv', '510_summary_LilUCB.csv', '516_summary_LilUCB.csv', '511_summary_RoundRobin.csv', '510_summary_RoundRobin.csv', '512_summary_LilUCB.csv', '518_summary_LilUCB.csv', '560_summary_KLUCB_original.csv', '519_summary_RoundRobin.csv', '559_summary_RandomSamping.csv', '519_summary_LilUCB.csv', '509-round1_summary_RoundRobin.csv', '518_summary_RoundRobin.csv', '516_summary_RoundRobin.csv'}
[4]:
# Number of summary identifiers. This is not the number of distinct contests:
# some contest numbers occur under more than one filename in `contests`.
len(contests)
[4]:
193
[5]:
# Load the summary for every identifier, one result per entry in `contests`.
summaries = list(map(ccd.summary, contests))
[6]:
import pandas as pd
# Stack the individual summaries into a single frame. `sort=False` leaves the
# columns in their existing order instead of sorting them.
summary = pd.concat(summaries, sort=False)
[7]:
# Preview the first rows of the combined table.
summary.head()
[7]:
target_id rank funny somewhat_funny unfunny count score precision contest caption
0 6785.0 1 22 18 12 52 2.192308 0.109979 509 I recommend the trout.
1 3197.0 2 5 11 7 23 1.913043 0.152878 509 May I recommend the trout?
2 469.0 3 5 9 8 22 1.863636 0.165085 509 On second thought, maybe we should just go wit...
3 1800.0 4 5 6 8 19 1.842105 0.191381 509 Take my wife, please!
4 2317.0 5 5 5 9 19 1.789474 0.196146 509 All that work to die a virgin.
[8]:
import numpy as np
# Per-contest aggregates, indexed by contest number:
#   count    - total number of responses (sum of the `count` column)
#   n_unique - number of distinct caption strings
# A single groupby with built-in reductions is used rather than passing NumPy
# callables / lambdas to `pivot_table` and merging the two results.
by_contest = summary.groupby("contest")
df = pd.DataFrame(
    {
        "count": by_contest["count"].sum(),
        # Uniqueness is evaluated within each contest, so a caption text that
        # occurs in two contests contributes to both rows.
        "n_unique": by_contest["caption"].nunique(),
    }
)
df.head()
[8]:
count n_unique
contest
508 137868 6778
509 98241 7858
510 82627 3905
511 56660 3325
512 43120 4399
[9]:
# Distinct contest numbers present in the aggregated table.
df.index.unique()
[9]:
Int64Index([508, 509, 510, 511, 512, 513, 514, 515, 516, 517,
            ...
            676, 677, 678, 679, 680, 681, 682, 683, 684, 685],
           dtype='int64', name='contest', length=176)
[10]:
# Total number of responses summed over all contests, expressed in millions.
"{} million human responses".format(df["count"].sum() / 1e6)
[10]:
'119.157775 million human responses'
[11]:
# Sum of the per-contest unique-caption counts. A caption text that occurs in
# more than one contest is therefore counted once for each contest.
"{} captions".format(df["n_unique"].sum())
[11]:
'970327 captions'
[12]:
# `df` has one row per contest number, so its length is the contest count.
"{} contests".format(len(df))
[12]:
'176 contests'
[13]:
# Histogram of total responses per contest, rescaled to millions.
responses = df["count"].div(1e6)
ax = responses.plot(kind="hist", bins=20)
ax.set_xlabel("Responses (millions)")
[13]:
Text(0.5, 0, 'Responses (millions)')
../_images/example-analyses_Number-of-responses_14_1.png
[14]:
# Histogram of the number of unique captions per contest.
responses = df.loc[:, "n_unique"]
ax = responses.plot(kind="hist", bins=20)
ax.set_xlabel("Unique captions")
[14]:
Text(0.5, 0, 'Unique captions')
../_images/example-analyses_Number-of-responses_15_1.png
[15]:
# Responses per contest, with contest number on the x axis and a log y axis.
per_contest = df.reset_index()
ax = per_contest.plot.line(x="contest", y="count", style="o")
ax.set_yscale("log")
# Turn on the grid, then draw the minor grid lines fainter.
ax.grid()
ax.grid(which="minor", alpha=0.4)
ax.set_ylabel("Number of responses")
[15]:
Text(0, 0.5, 'Number of responses')
../_images/example-analyses_Number-of-responses_16_1.png
[16]:
# Unique captions per contest, with contest number on the x axis.
per_contest = df.reset_index()
ax = per_contest.plot.line(x="contest", y="n_unique", style="o")
ax.set_ylabel("Number of captions")
ax.grid(alpha=0.4)
../_images/example-analyses_Number-of-responses_17_0.png
[ ]: