import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
# Name of the corpus; it prefixes both the counter variable read here and the
# summary variable written at the end of this file.
database_name = "sociology-wos"

# Load three counters through the knowknow helper: one keyed by 'fy.ta'
# (unpacked below as (year, name) pairs), one by 'ta' and one by 'fy'.
# NOTE(review): presumably these are document counts per year/author -- confirm.
docs = get_cnt("%s.doc" % database_name, ['fy.ta', 'ta', 'fy'])

# Re-index the joint counter as ay[name][year] -> accumulated count.
ay = defaultdict(lambda: defaultdict(int))
for year_and_name, c in docs['fy.ta'].items():
    y, a = year_and_name
    ay[a][y] += c
# Build `aysum`: one summary dict per entry of `ay` (presumably per author),
# describing when that entry first/last appears, when it peaks, and three
# alternative "death" years.
#
# Each summary holds, in this order:
#   first, last          -- earliest / latest year with an entry
#   maxcounty, maxpropy  -- year of the highest count / highest proportion
#   maxprop, maxcount    -- the values reached in those years
#   total, totalprop     -- sums of the yearly counts / proportions
#   name                 -- the key of the entry in `ay`
#   death3, death2, death1 -- see the comments where they are computed

# Latest year considered by the three "death" measures.
DEATH_CUTOFF_YEAR = 2005

aysum = {}
for a in ay:
    if docs['ta'][a] == 0:
        continue

    count = ay[a]
    if not count:
        # Nothing recorded for this name: report it and move on, rather than
        # letting min()/max() below fail on an empty mapping.
        print(a)
        continue

    first_year = min(count)
    last_year = max(count)
    # Ties on the count are broken in favour of the later year.
    max_count_year = max(count, key=lambda y: (count[y], y))
    max_count = count[max_count_year]
    total = sum(count.values())

    # Cheap filters first, so the proportion and decade-window work below is
    # only done for entries that are actually kept.
    # don't care about those with only a single publication
    if total <= 1:
        continue
    # we really don't care about those that never rise in use
    if first_year == max_count_year:
        continue

    # Yearly count as a share of that year's overall 'fy' count.
    prop = {y: n / docs['fy'][y] for y, n in count.items()}
    max_prop_year = max(prop, key=lambda y: (prop[y], y))

    res = {
        'first': first_year,
        'last': last_year,
        'maxcounty': max_count_year,
        'maxpropy': max_prop_year,
    }
    res['maxprop'] = prop[max_prop_year]
    res['maxcount'] = max_count
    res['total'] = total
    res['totalprop'] = sum(prop.values())
    res['name'] = a

    # death3 is the last year, as long as it is no later than the cutoff
    res['death3'] = last_year if last_year <= DEATH_CUTOFF_YEAR else None

    # death2: the first year, at or after the peak year, from which every
    # following 10-year window totals less than the peak year alone.
    # The windows sum *counts*, the same unit as `maxcount`; summing
    # proportions here would make the comparison below almost always true.
    next_year_sums = [
        (ycheck, sum(n for y, n in count.items() if ycheck < y <= ycheck + 10))
        for ycheck in range(max_count_year, DEATH_CUTOFF_YEAR)
    ]
    # Walk backwards from the cutoff: keep moving the death year earlier while
    # the windows stay below the peak, and stop at the first window that
    # reaches it. This yields the start of the trailing run of "quiet" windows,
    # or None when there is no such run (or no window at all).
    death2 = None
    for ycheck, window_total in reversed(next_year_sums):
        if window_total >= max_count:
            break
        death2 = ycheck
    res['death2'] = death2

    # death1 is the peak year, as long as it is no later than the cutoff
    res['death1'] = max_count_year if max_count_year <= DEATH_CUTOFF_YEAR else None

    aysum[a] = res
# Peek at the last five summaries. As a bare expression this only shows
# something in an interactive/notebook session; run as a plain script the
# value is computed and discarded.
list(aysum.values())[-5:]
# Hand the finished summaries to save_variable under "<database_name>.aysum".
# NOTE(review): save_variable comes from the knowknow star-import; where and
# in what format it stores the data is not visible from this file.
save_variable("%s.aysum" % database_name, aysum)