imports

In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
In [2]:
# Name of the counts database; used below as the prefix of the variable that
# is loaded ("<database_name>.doc") and of the one that is saved
# ("<database_name>.aysum").
database_name = "sociology-wos"
In [8]:
# Load the three count tables this notebook needs from the ".doc" variable.
docs = get_cnt("{}.doc".format(database_name), ['fy.ta', 'ta', 'fy'])

# Re-index the joint table, whose keys are 2-tuples, into a nested mapping
# ay[a][y] -> accumulated count, so every `a` can be scanned year by year.
# NOTE(review): keys look like (year, name) pairs -- confirm against get_cnt.
ay = defaultdict(lambda: defaultdict(int))

year_name_counts = docs['fy.ta']
for (y, a), c in year_name_counts.items():
    per_year = ay[a]
    per_year[y] += c
Loaded keys: dict_keys(['fy.ta', 'ta', 'fy'])
Available keys: ['c', 'c.fj', 'c.fy', 'c.fy.j', 'fa', 'fa.c', 'fa.fj', 'fa.fj.fy', 'fa.fy', 'fj', 'fj.fy', 'fj.ta', 'fj.ty', 'fy', 'fy.ta', 'fy.ty', 'ta', 'ty']
In [9]:
# All three "death" measures below only ever report a year up to this cutoff.
DEATH_CUTOFF_YEAR = 2005
# Length, in years, of the look-ahead window used by the death2 measure.
DEATH_WINDOW_YEARS = 10


def _death2_year(count, peak_year, peak_count,
                 cutoff=DEATH_CUTOFF_YEAR, window=DEATH_WINDOW_YEARS):
    """Return the "death2" year for one yearly count series, or None.

    A candidate year ``ycheck`` (``peak_year <= ycheck < cutoff``) is accepted
    when the total count over the following ``window`` years, i.e. the years
    ``ycheck < y <= ycheck + window``, is strictly below ``peak_count`` -- and
    the same holds for every later candidate year as well.  The earliest such
    year is returned.

    Parameters
    ----------
    count : mapping of int year -> count
    peak_year : int, year in which the series reaches its maximum count
    peak_count : the maximum yearly count itself
    cutoff : int, exclusive upper bound of the candidate years
    window : int, number of years summed after each candidate year

    Returns
    -------
    int or None
        None when there is no candidate year (``peak_year >= cutoff``) or when
        the latest candidate's window already reaches ``peak_count``.
    """
    death_year = None

    # Walk the candidates from latest to earliest: the first window that
    # reaches the peak invalidates every earlier candidate, so one pass is
    # enough to enforce "this and ALL subsequent windows are below the peak".
    for ycheck in range(cutoff - 1, peak_year - 1, -1):
        # Windows are summed in raw counts so that they are comparable with
        # peak_count, which is a raw count too.
        window_total = sum(
            n for y, n in count.items() if ycheck < y <= ycheck + window
        )
        if window_total >= peak_count:
            break
        death_year = ycheck

    return death_year


# aysum[name] -> dict of summary statistics of that name's yearly counts:
#   first / last          first and last year with an entry
#   maxcounty / maxcount  year of the highest count (ties -> latest year) and that count
#   maxpropy / maxprop    year of the highest proportion (ties -> latest year) and that proportion
#   total / totalprop     sums of the yearly counts / yearly proportions
#   name                  the key itself
#   death1, death2, death3   three alternative "death year" measures (None if not applicable)
aysum = {}

for a, count in ay.items():
    if docs['ta'][a] == 0:
        continue

    # Nothing to summarise; min()/max() below would raise on an empty mapping.
    if not count:
        continue

    total = sum(count.values())
    first = min(count)
    last = max(count)
    peak_count_year = max(count, key=lambda y: (count[y], y))

    # don't care about those with only a single publication
    if total <= 1:
        continue

    # we really don't care about those that never rise in use
    if first == peak_count_year:
        continue

    # yearly count as a share of that year's overall total in docs['fy']
    prop = {y: n / docs['fy'][y] for y, n in count.items()}
    peak_prop_year = max(count, key=lambda y: (prop[y], y))

    res = {
        'first': first,
        'last': last,
        'maxcounty': peak_count_year,
        'maxpropy': peak_prop_year,
        'maxprop': prop[peak_prop_year],
        'maxcount': count[peak_count_year],
        'total': total,
        'totalprop': sum(prop.values()),
        'name': a,
    }

    # death3 is the last year, as long as it's not after the cutoff
    res['death3'] = last if last <= DEATH_CUTOFF_YEAR else None

    # death2 is the first year from the peak onwards after which no
    # following decade ever adds up to the peak-year count again
    res['death2'] = _death2_year(count, peak_count_year, res['maxcount'])

    # death1 is the peak (max count) year, as long as it's not after the cutoff
    res['death1'] = peak_count_year if peak_count_year <= DEATH_CUTOFF_YEAR else None

    aysum[a] = res
In [10]:
# Sanity check: show the five most recently inserted summary records.
list(aysum.values())[-5:]
Out[10]:
[{'first': 1926,
  'last': 1930,
  'maxcounty': 1930,
  'maxpropy': 1930,
  'maxprop': 0.03333333333333333,
  'maxcount': 1,
  'total': 2,
  'totalprop': 0.06559139784946236,
  'name': 'dealey',
  'death3': 1930,
  'death2': 1930,
  'death1': 1930},
 {'first': 1926,
  'last': 1929,
  'maxcounty': 1929,
  'maxpropy': 1929,
  'maxprop': 0.045454545454545456,
  'maxcount': 1,
  'total': 2,
  'totalprop': 0.07771260997067449,
  'name': 'mateer',
  'death3': 1929,
  'death2': 1929,
  'death1': 1929},
 {'first': 1927,
  'last': 1929,
  'maxcounty': 1929,
  'maxpropy': 1927,
  'maxprop': 0.05555555555555555,
  'maxcount': 1,
  'total': 2,
  'totalprop': 0.10101010101010101,
  'name': 'maulsby',
  'death3': 1929,
  'death2': 1929,
  'death1': 1929},
 {'first': 1917,
  'last': 1925,
  'maxcounty': 1925,
  'maxpropy': 1917,
  'maxprop': 0.3333333333333333,
  'maxcount': 1,
  'total': 2,
  'totalprop': 0.367816091954023,
  'name': 'conradi',
  'death3': 1925,
  'death2': 1925,
  'death1': 1925},
 {'first': 1902,
  'last': 1904,
  'maxcounty': 1904,
  'maxpropy': 1904,
  'maxprop': 1.0,
  'maxcount': 1,
  'total': 2,
  'totalprop': 1.2,
  'name': 'degreef',
  'death3': 1904,
  'death2': 1904,
  'death1': 1904}]
In [11]:
# Hand the per-name summaries to save_variable under "<database_name>.aysum".
save_variable("%s.aysum" % database_name, aysum)