import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
# Last year treated as reliable when the death-year heuristics are evaluated.
RELIABLE_DATA_ENDS_HERE = 2010

# Name of the count database; the counts themselves are stored under
# "<database_name>.doc".
database_name = "sociology-jstor"

# Load the three count tables that the summary step reads.
cits = get_cnt(
    "%s.doc" % database_name,
    keys=['fy.t','t','fy'],
)
import re
def _first_quiet_year(decade_sums, threshold):
    """Return the earliest year from which every decade total stays below *threshold*.

    ``decade_sums`` is a list of ``(year, total)`` pairs in ascending year
    order.  The result is ``None`` when the list is empty or when the final
    total still reaches ``threshold``.
    """
    quiet_year = None
    # The answer is the start of the trailing run of below-threshold totals,
    # so walk backwards and stop at the first total that reaches the threshold.
    for year, total in reversed(decade_sums):
        if total >= threshold:
            break
        quiet_year = year
    return quiet_year


def create_tysum(cits, reliable_data_ends_here=None, blacklist=()):
    """Build a per-item summary of yearly counts.

    Parameters
    ----------
    cits : mapping
        Must provide two tables:
        ``cits['fy.t']`` maps keys exposing ``.fy`` and ``.t`` attributes to
        a count, and ``cits['fy']`` maps 1-tuples ``(year,)`` to the overall
        count for that year (used as the denominator of the proportions).
    reliable_data_ends_here : int, optional
        Last year considered reliable for the ``death*`` fields.  When
        omitted, the module-level ``RELIABLE_DATA_ENDS_HERE`` is used.
    blacklist : iterable, optional
        Item names to remove from the result after all other filters.

    Returns
    -------
    dict
        Maps each item name to a dict with the keys ``first``, ``last``,
        ``maxcounty``, ``maxpropy``, ``maxprop``, ``maxcount``, ``total``,
        ``totalprop``, ``name``, ``death3``, ``death2`` and ``death1``.
        Items whose total count is at most 1 are left out.

    Notes
    -----
    * ``death3`` is the last year with a count, if it is not after the cutoff.
    * ``death2`` is the first year, starting at the peak-count year and before
      the cutoff, from which every following ten-year window totals less than
      the peak yearly count.
    * ``death1`` is the peak-count year, if it is not after the cutoff.  The
      guard is applied to the year that is actually stored, so a value after
      the cutoff can never be reported.
    * The filter counters are printed as a side effect.
    """
    if reliable_data_ends_here is None:
        reliable_data_ends_here = RELIABLE_DATA_ENDS_HERE
    cutoff = reliable_data_ends_here

    meta_counters = defaultdict(int)

    # Regroup the flat (year, item) -> count table as item -> {year: count}.
    ty = defaultdict(lambda: defaultdict(int))
    for comb, n in cits['fy.t'].items():
        ty[comb.t][comb.fy] = n

    tysum = {}
    for t, count in ty.items():
        meta_counters['at least one citation'] += 1

        # don't care about those with only a single citation; checking this
        # first avoids computing statistics that would be thrown away
        total = sum(count.values())
        if total <= 1:
            meta_counters['literally 1 citation. dropped.'] += 1
            continue

        # Share of each year's overall count that goes to this item.
        prop = {y: county / cits['fy'][(y,)] for y, county in count.items()}

        # Ties on the value are broken in favour of the later year.
        peak_count_year = max(count, key=lambda y: (count[y], y))
        peak_prop_year = max(count, key=lambda y: (prop[y], y))

        res = {
            'first': min(count),
            'last': max(count),
            'maxcounty': peak_count_year,
            'maxpropy': peak_prop_year,
            'maxprop': prop[peak_prop_year],
            'maxcount': count[peak_count_year],
            'total': total,
            'totalprop': sum(prop.values()),
            'name': t,
        }

        # death3 is last, as long as it's not after the cutoff
        res['death3'] = res['last'] if res['last'] <= cutoff else None

        # Total count in the ten years following each candidate year.
        decade_sums = [
            (ycheck, sum(c for y, c in count.items() if ycheck + 10 >= y > ycheck))
            for ycheck in range(peak_count_year, cutoff)
        ]
        # ALL subsequent decade intervals must stay below the peak count.
        res['death2'] = _first_quiet_year(decade_sums, res['maxcount'])

        # death1 is the peak-count year, as long as it's not after the cutoff
        res['death1'] = peak_count_year if peak_count_year <= cutoff else None

        meta_counters['passed tests pre-blacklist'] += 1
        tysum[t] = res

    for b in blacklist:
        if b in tysum:
            del tysum[b]
            meta_counters['passed all other tests but was blacklisted'] += 1

    print(dict(meta_counters))
    return tysum
# Build the per-item summary table from the loaded counts.
tysum = create_tysum(cits)

# Persist it under "<database_name>.tysum".
varname = "%s.tysum" % database_name
save_variable(varname, tysum)

# Report how many items made it into the summary.
print(
    "%s tysum entries for database '%s'"
    % (len(tysum), database_name)
)