In [ ]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *

Instructions: carefully set the user parameters, then run the entire notebook.

Output: the saved variable is called "<database_name>.<dtype>.ysum", for example "sociology-wos.ta.ysum".

Description: this notebook computes time-trend statistics for each "item" of type dtype ("ta" for cited author, "c" for cited work, or "t" for cited term). These statistics are useful across many later analyses, so computing them all once saves code and computation time. It also standardizes common operations, reducing the chance of a bug going unnoticed.
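As a concrete illustration of the output, a sketch of how a later analysis might load it (assuming `from knowknow import *` as at the top of this notebook, and that this notebook has already been run with database_name="sociology-wos" and dtype="c"; the fields shown are among those computed below):

    # illustrative only: load the summary produced by this notebook in a later analysis
    sums = load_variable("sociology-wos.c.ysum")   # dict keyed by item name
    example = next(iter(sums.values()))            # one item's summary dict
    print(example['first'], example['last'], example['total'])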

User parameters

database_name The name of an existing database of counts, e.g. "sociology-wos". You should already have built one before running this notebook.

dtype The item of interest (e.g. "ta" for cited author, "c" for cited work, or "t" for cited term)

In [2]:
database_name = "sociology-wos-all"
dtype = 'c'

D_CUTOFFS = [0, 0.1, 0.2, 0.3, 0.5]
RB_CUTOFFS = [0.3, 0.5, 1, 2]

compute_deaths = True
In [3]:
# Parameters (this cell overrides the defaults set above)
database_name = "sociology-wos"
dtype = "ffa"
In [4]:
pubyears = None

if 'wos' in database_name and 'jstor' in database_name:
    raise Exception("Please put 'wos' or 'jstor' but not both in any database_name.")
    
elif 'wos' in database_name:
    pubyears = load_variable("%s.pubyears" % database_name)
    print("Pubyears loaded for %s entries" % len(pubyears.keys()))
    RELIABLE_DATA_ENDS_HERE = 2019
    database_type = 'wos'
    
elif 'jstor' in database_name:
    RELIABLE_DATA_ENDS_HERE = 2010
    database_type = 'jstor'
    
else:
    raise Exception("Please include either 'wos' or 'jstor' in the database_name. This indicates which data-processing algorithm was used.")
Pubyears loaded for 111731 entries

Helper classes

In [5]:
class ysums:
    # simple container: a list of per-item summaries plus bookkeeping counters
    def __init__(self):
        self.ysums = []
        self.meta = defaultdict(int)
    def add(self, x):
        self.ysums.append(x)
In [6]:
class ysum:
    # methods registered via addinit() run automatically when an instance is created
    init_stack = []
    
    def __init__(self, **kwargs):        
        self.dict_filter = set(list(self.__dict__)+["dict_filter"]) # record keys to exclude from dump()
        
        for k,v in kwargs.items():
            setattr(self,k,v)
    
        for name in self.init_stack:
            getattr(self,name)()
                
    def dump(self):
        to_return_keys = set(self.__dict__).difference(self.dict_filter)
        
        return {
            k: getattr(self,k)
            for k in to_return_keys
            if k[0] != '_'
        }
    
    @classmethod
    def addinit(cls, st):
        if st not in cls.init_stack:
            cls.init_stack.append(st)
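A note on the pattern used throughout this notebook: each later cell re-declares class ysum(ysum) to attach a new method, and ysum.addinit(...) registers which of those methods run automatically when an item's summary is built in the master loop. A minimal sketch of the same mechanism, with illustrative names that are not part of the pipeline:

    class summary:
        init_stack = []
        def __init__(self, **kwargs):
            for k, v in kwargs.items():
                setattr(self, k, v)
            for name in self.init_stack:
                getattr(self, name)()        # run every registered initializer
        @classmethod
        def addinit(cls, st):
            if st not in cls.init_stack:
                cls.init_stack.append(st)

    summary.addinit("double")                # registering before the method exists is fine...
    class summary(summary):                  # ...as long as it is defined before instantiation
        def double(self):
            self.twice = self.x * 2

    print(summary(x=3).twice)                # -> 6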

Getting data

In [7]:
ysum.cits = get_cnt("%s.doc"%database_name, [comb(dtype,'fy'),'fy',dtype])
Loaded keys: dict_keys(['ffa.fy', 'fy', 'ffa'])
Available keys: ['a', 'c', 'c.c', 'c.fj', 'c.fy', 'c.fy.j', 'fa', 'fa.c', 'fa.fj', 'fa.fj.fy', 'fa.fy', 'ffa', 'ffa.c', 'ffa.fj', 'ffa.fy', 'fj', 'fj.fy', 'fj.ta', 'fj.ty', 'fy', 'fy.ta', 'fy.ty', 'ta', 'ty', 'ty.ty']

The following reorganizes the citation counts slightly, for efficient lookup by item and then by citing year.

In [8]:
ysum.cy = defaultdict(lambda:defaultdict(int))

for cross,count in ysum.cits[ comb(dtype, 'fy') ].items():
    ysum.cy[ getattr(cross, dtype) ][ cross.fy ] = count

computing simple and universal statistics

In [9]:
ysum.addinit("simple_stats")

class ysum(ysum):
    
    def simple_stats(self):
        self._c = self.cy[ self.name ]
        self._p =  {
            y: count / self.cits['fy'][(y,)]
            for y,count in self._c.items()
            if count > 0 and self.cits['fy'][(y,)] > 0
        }
        
        self.first = min(self._c)
        self.last = max(self._c)
        self.maxcounty = max(self._c, key=lambda y:(self._c[y],y))
        self.maxpropy = max(self._p, key=lambda y:(self._p[y],y))

        self.maxprop = self._p[ self.maxpropy ]
        self.maxcount = self._c[ self.maxcounty ]
        self.total = sum(self._c.values())
        self.totalprop = sum(self._p.values())
        
        if dtype == 'c':
            # extracts some extra information from the name
            
            self.type = 'article'
            if database_type == 'wos':
                sp = self.name.split("|")
                try:
                    self.pub = int(sp[1])
                    self.type = 'article'
                except ValueError:
                    self.type = 'book'
                    self.pub = pubyears[self.name]

            elif database_type == 'jstor':
                inparens = re.findall(r'\(([^)]+)\)', self.name)[0]
                self.pub = int(inparens)
    
            
    def sum_between(self, A, B): #not including B
        if A > B:
            raise Exception("Can only sum forwards in time. Fix your code...")
        return sum( c for y,c in self._c.items() if B > y >= A )
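For concreteness, the tie-breaking in maxcounty (and maxpropy) favors the latest year among ties, because the sort key is the pair (count, year). A standalone check with toy numbers, not taken from the data:

    toy = {1990: 3, 1995: 3, 2001: 1}                # hypothetical year -> citation count
    maxcounty = max(toy, key=lambda y: (toy[y], y))  # highest count; latest year wins ties
    first, last, total = min(toy), max(toy), sum(toy.values())
    print(maxcounty, first, last, total)             # -> 1995 1990 2001 7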

definitions of death

death$_1$ is the last year of maximum citations, as long as it's before RELIABLE_DATA_ENDS_HERE

In [10]:
#ysum.addinit("_death1")

class ysum(ysum):
    def _death1(self):
        # DEFINING DEATH1
        # death1 is the peak-count year (maxcounty), provided the peak-proportion year falls before RELIABLE_DATA_ENDS_HERE
        if self.maxpropy <= RELIABLE_DATA_ENDS_HERE:
            self.death1 = self.maxcounty
            return
        
        self.death1 = None

death$_2$ is the first year, at or after the year of maximum citations, from which no subsequent multi-year window (as summed in the code below) ever again accumulates as many citations as the peak year received.

In [11]:
#ysum.addinit("_death2")

class ysum(ysum):
    def _death2(self):

        # one entry for each year from the (last) peak year onward, within the reliable window:
        # the total citations received in the window starting at that year (sum_between(ycheck, ycheck+9))
        next_year_sums = [
            (ycheck, self.sum_between(ycheck, ycheck+9))
            for ycheck in range(self.maxcounty, RELIABLE_DATA_ENDS_HERE - 10) 
        ]

        # need to make sure ALL subsequent decade intervals are also less...
        my_death_year = None

        l = len(next_year_sums)
        for i in range(l):
            not_this_one = False
            for j in range(i,l):
                if next_year_sums[j][1] >= self.maxcount:
                    not_this_one = True
                if not_this_one:
                    break

            if not_this_one:
                continue

            my_death_year = next_year_sums[i][0]
            break

        if not len(next_year_sums):
            self.death2 = None
        else:
            self.death2 = my_death_year

        return None
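As a sanity check on what death$_2$ computes, here is the same criterion applied to a toy citation series outside the class (hypothetical numbers):

    toy = {1980: 1, 1983: 5, 1985: 2, 1990: 1}       # hypothetical year -> citation count
    maxcount, maxcounty, END = 5, 1983, 2019
    def window(a, b):                                 # citations in years [a, b)
        return sum(c for y, c in toy.items() if a <= y < b)
    sums = [(y, window(y, y + 9)) for y in range(maxcounty, END - 10)]
    # death2: first year at/after the peak whose window, and every later window, stays below the peak count
    death2 = next((y for i, (y, s) in enumerate(sums)
                   if all(s2 < maxcount for _, s2 in sums[i:])), None)
    print(death2)                                     # -> 1984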
    
In [12]:
#ysum.addinit("_death3")

class ysum(ysum):
    def _death3(self):
        # DEATH3 is last, as long as it's before RELIABLE_DATA_ENDS_HERE - 10
        # no citations in the last 10 years
        self.death3 = None
        if self.last <= RELIABLE_DATA_ENDS_HERE - 10:
            self.death3 = self.last
In [13]:
#ysum.addinit("_death5")

class ysum(ysum):
    def _death5(self):


        # DEATH5
        # the first cited year by which over 90% of all citations have been received,
        # provided more than 30% of the span from the first citation to RELIABLE_DATA_ENDS_HERE still lies after it
        myspan = np.array( [self.cits[ comb(dtype, 'fy')][ make_cross({dtype:self.name, 'fy':ycheck}) ] for ycheck in range(1900, 2020)] )

        self.death5 = None

        Ea = np.sum(myspan)    # running residual: citations not yet passed as we scan forward
        csum = np.sum(myspan)  # total citations over the whole span

        # np.where returns a tuple; take the index array so the emptiness check actually works
        nonzeroyears = np.where(myspan > 0)[0]
        if not len(nonzeroyears):
            self.death5 = None
            return

        firsti = np.min(nonzeroyears)

        first_year = firsti + 1900

        for cci, cc in enumerate(myspan[firsti:]):

            this_year = first_year+cci

            # running residual... 
            Ea -= cc

            # don't let them die too soon
            if cc == 0:
                continue

            if Ea/csum < 0.1 and (RELIABLE_DATA_ENDS_HERE - this_year)/(RELIABLE_DATA_ENDS_HERE - first_year) > 0.3:
                self.death5 = this_year
                break
In [14]:
if compute_deaths:
    ysum.addinit("_deathN")

class ysum(ysum):
    def _deathN(self):

        
        
        
        
        def getname(CUTOFF):            
            return "death_%d"%(CUTOFF*10)
        
        def getrname(dCUTOFF, rbCUTOFF):            
            return "rebirth_%d_%d"%(dCUTOFF*10, rbCUTOFF*10)
        

        # For each candidate year y after the peak:
        #   before = average citations/year from the first citation up to (not including) y
        #   after  = average citations/year over the decade starting at y
        # death at cutoff d is the first y where after/before <= d;
        # rebirth at (d, r) is a later year, at least 10 years after that death,
        # where the decade-ahead rate exceeds r times the pre-death rate.
        
        
        # preset all deaths as None
        for CUTOFF in D_CUTOFFS:
            setattr(self, getname(CUTOFF), None)
        for dCUTOFF in D_CUTOFFS:
            for rbCUTOFF in RB_CUTOFFS:
                setattr(self, getrname(dCUTOFF, rbCUTOFF), None)


        
        for y in range(self.maxcounty + 1, RELIABLE_DATA_ENDS_HERE):            
            #if (RELIABLE_DATA_ENDS_HERE - y)/(RELIABLE_DATA_ENDS_HERE - self.first) < 0.3:
            #    # eventually we just can't tell anymore...
            #    continue
            if RELIABLE_DATA_ENDS_HERE - y <= 10:
                continue
                
            # death
            
                
            before = self.sum_between(self.first, y) / (y - self.first)
            after = self.sum_between(y,y+10) / 10
            
            for CUTOFF in D_CUTOFFS:
                myname = getname(CUTOFF)
                    
                if getattr(self, myname) is not None:
                    continue
                    
                if after/before <= CUTOFF:
                    setattr(self, myname, y)
                    
            
            
            # rebirth
            
            after = self.sum_between(y,y+10) / 10
            
            for D_CUTOFF in D_CUTOFFS:
                mydname = getname(D_CUTOFF)
                
                for RB_CUTOFF in RB_CUTOFFS:
                    myrname = getrname(D_CUTOFF,RB_CUTOFF)

                    mydeath = getattr(self, mydname)

                    if mydeath is None: # if there's no death, there can't be a rebirth
                        continue
                    if not( y >= mydeath+10 ): # at least 10 years dead!
                        continue

                    before = self.sum_between(self.first, mydeath) / (mydeath - self.first)

                    if after/before > RB_CUTOFF:
                        if getattr(self, myrname) is None:
                            # record the rebirth at the next year that actually receives a citation
                            rb_year = min( yy for yy,c in self._c.items() if yy >= y and c > 0 )
                            setattr(self, myrname, rb_year)
                    
        # label those on the cusp explicitly (look at y=RELIABLE_DATA_ENDS_HERE-10)
        # if they wouldn't die that year, mark them as True.
        # if they would die in any year after, mark them as None
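In words: for each candidate year y after the peak, "before" is the average citations per year from the first citation up to (but not including) y, and "after" is the average over the following decade. Death at cutoff d is the first y where after/before <= d; rebirth at (d, r) is a later year, at least ten years after that death, whose decade-ahead rate exceeds r times the pre-death rate (recorded at the next year that actually receives a citation). The sketch below reproduces just the death part on a toy series (hypothetical numbers, not pipeline code):

    toy = {1970: 1, 1972: 4, 1973: 4, 1974: 1}        # hypothetical year -> citation count
    first, maxcounty, END = 1970, 1973, 2019
    def window(a, b):                                  # citations in years [a, b)
        return sum(c for y, c in toy.items() if a <= y < b)
    deaths = {}
    for y in range(maxcounty + 1, END):
        if END - y <= 10:
            continue
        before = window(first, y) / (y - first)        # average rate up to (not including) y
        after = window(y, y + 10) / 10                 # average rate over the next decade
        for cutoff in [0, 0.1, 0.2, 0.3, 0.5]:
            if cutoff not in deaths and after / before <= cutoff:
                deaths[cutoff] = y
    print(deaths)                                      # cutoff 0 dies in 1975, the others in 1974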

master loop

In [15]:
print("Processing database '%s'"%database_name)

mysums = ysums()
for ci,item in enumerate(ysum.cy):
    
    if ci % 5000 == 0:
        print("Item %s" % ci)
        
    if ci % 50000 == 0:
        print(mysums.meta)
        
    #if ci > 10000:
    #    break
        
    res = {}
    mysums.meta['at least one citation'] += 1
    
    if  database_type == 'wos' and dtype == 'c':
        sp = item.split("|")
        if len(sp) < 2:
            mysums.meta['not enough parts'] += 1
            continue

    mysum = ysum( name=item )

    # small sanity check: drop items whose first citation precedes their recorded publication year
    if hasattr(mysum,'pub') and mysum.first < mysum.pub:
        mysums.meta['first citation before pub date'] += 1
        continue

    # don't care about those with only a single citation
    if mysum.total <= 1:
        mysums.meta['literally 1 citation. dropped.'] += 1
        continue

    # we really don't care about those that never rise in use
    if mysum.first == mysum.maxpropy:
        mysums.meta['never rise'] += 1
        continue
        
    mysums.meta['passed tests pre-blacklist'] += 1

    
    mysums.add(mysum)
Processing database 'sociology-wos'
Item 0
defaultdict(<class 'int'>, {})
Item 5000
Item 10000
Item 15000
Item 20000
Item 25000
Item 30000
Item 35000

Debugging goes here

In [16]:
for ys in mysums.ysums[:10]:
    dd = ys.dump()
    for k in sorted(dd):
        print(k, dd[k])
        
    print("----------------------------")
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 2011
last 2017
maxcount 2
maxcounty 2015
maxprop 0.0006754474839581223
maxpropy 2015
name hooghe, m
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 6
totalprop 0.0020860875658478503
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 2011
last 2018
maxcount 2
maxcounty 2013
maxprop 0.0006963788300835655
maxpropy 2013
name gondal, n
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 6
totalprop 0.00210948546559322
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 1995
last 2016
maxcount 3
maxcounty 2013
maxprop 0.0010445682451253482
maxpropy 2013
name franzen, a
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 11
totalprop 0.00500903153824101
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 1995
first 1978
last 2015
maxcount 2
maxcounty 1990
maxprop 0.0015048908954100827
maxpropy 1990
name friedkin, n
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 2009
rebirth_5_5 2009
total 20
totalprop 0.014073701394404638
----------------------------
death_0 2000
death_1 2000
death_2 1994
death_3 1994
death_5 1990
first 1982
last 2019
maxcount 2
maxcounty 1989
maxprop 0.001444043321299639
maxpropy 1989
name wallace, m
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 2011
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 2011
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 2011
rebirth_5_5 None
total 10
totalprop 0.006331034301149077
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 2008
last 2019
maxcount 2
maxcounty 2014
maxprop 0.0006724949562878278
maxpropy 2014
name pais, j
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 7
totalprop 0.0025622212558883807
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 2000
last 2012
maxcount 2
maxcounty 2011
maxprop 0.0012062726176115801
maxpropy 2001
name crowder, k
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 11
totalprop 0.005646195842509818
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 2001
last 2015
maxcount 3
maxcounty 2011
maxprop 0.0017657445556209534
maxpropy 2007
name alon, s
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 12
totalprop 0.0059347316201944856
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 1998
last 2016
maxcount 2
maxcounty 2005
maxprop 0.0011813349084465446
maxpropy 2005
name carbonaro, w
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 8
totalprop 0.0039172848209811165
----------------------------
death_0 None
death_1 None
death_2 None
death_3 None
death_5 None
first 1990
last 2018
maxcount 2
maxcounty 2018
maxprop 0.0014705882352941176
maxpropy 1991
name sherkat, d
rebirth_0_10 None
rebirth_0_20 None
rebirth_0_3 None
rebirth_0_5 None
rebirth_1_10 None
rebirth_1_20 None
rebirth_1_3 None
rebirth_1_5 None
rebirth_2_10 None
rebirth_2_20 None
rebirth_2_3 None
rebirth_2_5 None
rebirth_3_10 None
rebirth_3_20 None
rebirth_3_3 None
rebirth_3_5 None
rebirth_5_10 None
rebirth_5_20 None
rebirth_5_3 None
rebirth_5_5 None
total 21
totalprop 0.012290534637626072
----------------------------
In [17]:
# backfill rebirth_5_0 .. rebirth_5_9 with None wherever they were never set above
for i in range(0,10):
    rbname = "rebirth_5_%s" % i
    for x in mysums.ysums:
        if not hasattr(x,rbname):
            setattr(x,rbname,None)
In [18]:
sorted(mysums.ysums[0].dump())
Out[18]:
['death_0',
 'death_1',
 'death_2',
 'death_3',
 'death_5',
 'first',
 'last',
 'maxcount',
 'maxcounty',
 'maxprop',
 'maxpropy',
 'name',
 'rebirth_0_10',
 'rebirth_0_20',
 'rebirth_0_3',
 'rebirth_0_5',
 'rebirth_1_10',
 'rebirth_1_20',
 'rebirth_1_3',
 'rebirth_1_5',
 'rebirth_2_10',
 'rebirth_2_20',
 'rebirth_2_3',
 'rebirth_2_5',
 'rebirth_3_10',
 'rebirth_3_20',
 'rebirth_3_3',
 'rebirth_3_5',
 'rebirth_5_0',
 'rebirth_5_1',
 'rebirth_5_10',
 'rebirth_5_2',
 'rebirth_5_20',
 'rebirth_5_3',
 'rebirth_5_4',
 'rebirth_5_5',
 'rebirth_5_6',
 'rebirth_5_7',
 'rebirth_5_8',
 'rebirth_5_9',
 'total',
 'totalprop']
In [19]:
sum(x.rebirth_5_3 is not None for x in mysums.ysums)
Out[19]:
159
In [20]:
print(mysums.meta)
defaultdict(<class 'int'>, {'at least one citation': 39031, 'literally 1 citation. dropped.': 24226, 'passed tests pre-blacklist': 2911, 'never rise': 11894})
In [21]:
if False:
    # alternative output format: collect everything into a DataFrame
    mysum_final = pd.DataFrame.from_records( list(x.dump() for x in mysums.ysums) )
    mysum_final.shape
    
if True:
    # save as a dict keyed by item name
    mysum_final = {}
    for x in mysums.ysums:
        dp = x.dump()
        mysum_final[dp['name']] = dp
In [22]:
varname = "%s.%s.ysum"%(database_name,dtype)
save_variable(varname, mysum_final)

This step is only necessary if you plan to filter later analyses based on this set; a sketch of such a filter follows the cell below.

In [23]:
if True:
    save_variable("%s.included_citations"%database_name, set(x.name for x in mysums.ysums))
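A sketch of how the saved set might be used as a filter, shown here against the ysum.cy mapping built earlier in this notebook (in a separate notebook you would reload or rebuild the counts first; the variable names below are only illustrative):

    included = load_variable("%s.included_citations" % database_name)
    # restrict the per-item, per-year counts to items that passed the checks above
    kept_cy = {name: years for name, years in ysum.cy.items() if name in included}
    print(len(kept_cy), "of", len(ysum.cy), "items kept")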
In [ ]: