In [1]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
In [2]:
# Name of the dataset to summarise. Later cells build the "<name>.doc" identifier
# passed to get_cnt and the "<name>.tysum" variable name from this value.
database_name = "sociology-jstor"
In [3]:
# Load the count tables for this database, restricted to three keys:
#   'fy.t' - counts keyed by objects exposing .fy and .t (read by create_tysum)
#   'fy'   - totals keyed by (fy,), used by create_tysum as the denominator
#   't'    - loaded as well, although create_tysum below does not read it
# NOTE(review): presumably fy = citing publication year and t = cited term - confirm
# against the knowknow documentation.
cits = get_cnt("%s.doc" % database_name, keys=['fy.t','t','fy'])
Loaded keys: dict_keys(['fy.t', 't', 'fy'])
Available keys: ['a', 'a.c', 'a.fj.fy', 'c', 'c.c', 'c.fa', 'c.fj', 'c.fy', 'c.t', 'fa', 'fa.fj.fy', 'fj', 'fj.fy', 'fj.t', 'fy', 'fy.t', 't']
In [4]:
# Cutoff year read by create_tysum: "death" years that fall after it are reported
# as None, and decade windows are only started in years strictly before it.
# NOTE(review): presumably the last year with trustworthy corpus coverage - confirm.
RELIABLE_DATA_ENDS_HERE = 2010
import re

def _decade_death_year(count, peak_year, peak_count, cutoff):
    """Return the earliest year after which no following decade reaches the peak.

    For every candidate year ``ycheck`` in ``range(peak_year, cutoff)`` the
    "window total" is the sum of counts in the years ``ycheck < y <= ycheck + 10``.
    The result is the smallest ``ycheck`` such that its own window total and the
    window totals of *all later* candidate years are strictly below ``peak_count``.

    Args:
        count: mapping of year -> count for a single item.
        peak_year: year of the item's highest yearly count (first candidate).
        peak_count: the count in ``peak_year``.
        cutoff: exclusive upper bound for candidate years.

    Returns:
        The qualifying year, or ``None`` when there are no candidate years
        (``peak_year >= cutoff``) or the last candidate's window still reaches
        ``peak_count``.
    """
    death_year = None
    # Scan from the latest candidate backwards: as long as windows stay below the
    # peak, the current year is a valid (and earlier) answer. The first window
    # that reaches the peak ends the search, because every earlier year would
    # have that window among its "subsequent" ones.
    for ycheck in reversed(range(peak_year, cutoff)):
        window_total = sum(c for y, c in count.items() if ycheck < y <= ycheck + 10)
        if window_total >= peak_count:
            break
        death_year = ycheck
    return death_year


def create_tysum(cits, reliable_data_ends_here=None, blacklist=()):
    """Summarise the yearly counts of every ``t`` found in ``cits['fy.t']``.

    Args:
        cits: mapping with at least
            ``'fy.t'`` - {key: count} where ``key`` exposes ``.fy`` and ``.t``;
            ``'fy'``   - {(fy,): total}, the per-year denominator for proportions.
        reliable_data_ends_here: cutoff year for the ``death*`` fields. Defaults
            to the module-level ``RELIABLE_DATA_ENDS_HERE``.
        blacklist: iterable of ``t`` values to remove from the result after all
            other filters have been applied. Defaults to nothing.

    Returns:
        dict mapping ``t`` to a summary dict with the keys
        ``first`` / ``last`` (earliest / latest year with a count),
        ``maxcounty`` / ``maxcount`` (year and value of the highest yearly count,
        ties resolved towards the later year),
        ``maxpropy`` / ``maxprop`` (same, for count divided by the year's total),
        ``total`` / ``totalprop`` (sums over all years), ``name`` (``t`` itself),
        ``death3`` (``last`` if it is <= the cutoff, else None),
        ``death2`` (see ``_decade_death_year``) and
        ``death1`` (``maxcounty`` if it is <= the cutoff, else None).
        Items whose ``total`` is <= 1 are omitted.

    Side effects:
        Prints a dict of bookkeeping counters (how many items were seen, kept,
        dropped or blacklisted).
    """
    if reliable_data_ends_here is None:
        # Resolved at call time so that re-assigning the notebook constant and
        # re-running only the call cell still takes effect.
        reliable_data_ends_here = RELIABLE_DATA_ENDS_HERE

    meta_counters = defaultdict(int)

    # Regroup the flat (fy, t) -> count table into t -> {fy: count}.
    ty = defaultdict(lambda: defaultdict(int))
    for comb, count in cits['fy.t'].items():
        ty[comb.t][comb.fy] = count

    tysum = {}
    for t, count in ty.items():
        meta_counters['at least one citation'] += 1

        # don't care about those with only a single citation; skip them before
        # doing any of the per-year work below
        total = sum(count.values())
        if total <= 1:
            meta_counters['literally 1 citation. dropped.'] += 1
            continue

        # Share of each year's total that went to this item.
        prop = {
            y: county / cits['fy'][(y,)]
            for y, county in count.items()
        }

        res = {
            'first': min(count),
            'last': max(count),
            # The (value, year) sort key breaks ties in favour of the later year.
            'maxcounty': max(count, key=lambda y: (count[y], y)),
            'maxpropy': max(count, key=lambda y: (prop[y], y)),
        }

        res['maxprop'] = prop[res['maxpropy']]
        res['maxcount'] = count[res['maxcounty']]
        res['total'] = total
        res['totalprop'] = sum(prop.values())
        res['name'] = t

        # death3 is last, as long as it's before the cutoff
        res['death3'] = None
        if res['last'] <= reliable_data_ends_here:
            res['death3'] = res['last']

        # death2 is the first year from which no following decade ever adds up
        # to the peak yearly count again
        res['death2'] = _decade_death_year(
            count, res['maxcounty'], res['maxcount'], reliable_data_ends_here
        )

        # death1 is the peak-count year, as long as it's before the cutoff.
        # The guard must test the same year that is stored (maxcounty); testing
        # maxpropy could discard an in-range peak or keep an out-of-range one.
        res['death1'] = None
        if res['maxcounty'] <= reliable_data_ends_here:
            res['death1'] = res['maxcounty']

        # we really don't care about those that never rise in use
        #if res['first'] == res['maxpropy']:
        #    continue
        meta_counters['passed tests pre-blacklist'] += 1

        tysum[t] = res

    for b in blacklist:
        if b in tysum:
            del tysum[b]
            meta_counters['passed all other tests but was blacklisted'] += 1

    print(dict(meta_counters))

    return tysum
In [5]:
# Name handed to save_variable for the summary built below.
varname = "%s.tysum"%database_name


# Build the summary from the loaded counts and hand it to save_variable
# (presumably persists it under `varname` - confirm against knowknow).
tysum = create_tysum(cits)
save_variable(varname, tysum)

# Report how many entries survived create_tysum's filters.
print("%s tysum entries for database '%s'" % (len(tysum), database_name))
{'at least one citation': 10130, 'passed tests pre-blacklist': 2538, 'literally 1 citation. dropped.': 7592}
2538 tysum entries for database 'sociology-jstor'