finding and cataloguing rebirths

  • Only about 20% of citations with enough history have had a true death and rebirth.
  • Of these 20% of works (20,409 total), only 195 had an average yearly citation rate of at least 1 before their death.

User parameters

DEATH_CUTOFF tells the algorithm how strong deaths have to be to qualify as dead. Lower means more dead. Should always be less than 1.

In [1]:
# Name of the dataset; used as the prefix of every variable loaded or saved in this notebook.
database_name = 'sociology-wos'
# A work counts as dead once the mean citations of a 5-year window fall below
# this fraction of its mean citations over all earlier years.
DEATH_CUTOFF = 0.1
# A dead work counts as reborn once the mean citations of a later 5-year window
# reach at least this fraction of its mean citations before death.
REBIRTH_CUTOFF = 0.75
# Added to 10 to give the minimum number of years of data a work needs to be analysed.
REBIRTH_VISION = 5

imports

In [2]:
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
In [3]:
# Load the citation counts for this dataset, requesting only the 'c.fy' and 'c' tables.
# NOTE(review): 'c.fy' is consumed below as ((c, y), count) pairs -- presumably
# (cited work, year of the citing document); confirm against the knowknow docs.
cits = get_cnt("%s.doc" % database_name,['c.fy','c'])
Loaded keys: dict_keys(['c.fy', 'c'])
Available keys: ['c', 'c.fj', 'c.fy', 'c.fy.j', 'fa', 'fa.c', 'fa.fj', 'fa.fj.fy', 'fa.fy', 'fj', 'fj.fy', 'fj.ta', 'fj.ty', 'fy', 'fy.ta', 'fy.ty', 'ta', 'ty']
In [4]:
# Load the saved `cysum` variable for this dataset. It is used below as a mapping
# from cited work to a record whose 'pub' entry is the work's publication year.
cysum = load_variable("%s.cysum" % database_name)
In [5]:
# cy[c][y] holds the citation count recorded for work `c` in year `y`.
# Works without an entry in `cysum` are left out entirely.
cy = defaultdict(lambda: defaultdict(int))

for (c, y), count in cits['c.fy'].items():
    if c in cysum:
        cy[c][y] = count
In [6]:
# Scan the yearly citation series of every cited work for a "death" followed by
# a "rebirth":
#   * death   -- a 5-year window whose mean citations fall below DEATH_CUTOFF
#                times the mean over all earlier years;
#   * rebirth -- a later 5-year window whose mean citations reach at least
#                REBIRTH_CUTOFF times the mean before death.
# Every work that does both is appended to `comebacks`; `masterc` counts how
# many works reach each stage of the funnel.
comebacks = []
filt = False

if filt:
    # Optional restriction to the works already listed in `cb`.
    # NOTE(review): `cb` is only created by a later cell, so this branch can
    # only work when the notebook is re-run. Despite its name, `blacklist`
    # holds the works to KEEP (everything else is counted as 'filtered').
    blacklist = set(cb.name)

masterc = defaultdict(int)

# Print diagnostics about five times over the whole run. The step is clamped to
# at least 1 so that fewer than five works cannot cause a modulo by zero.
report_every = max(1, len(cy) // 5)

for ci,c in enumerate(cy):

    if filt and c not in blacklist:
        masterc['filtered'] += 1
        continue

    # periodic diagnostic account of what's happening while processing.
    if (ci+1) % report_every == 0:
        print("Processed %s docs." % ci)
        print(dict(masterc))
        print(len(comebacks))

    # the time trend of citation counts: a dictionary mapping year -> citation count
    count = cy[c]

    # publication date of cited work
    pub = cysum[c]['pub']

    # limit the time-range we consider: from the publication date (or the first
    # year with citations, if that is later) to the last year of the series.
    years = list(range(max(min(count),pub), max(count)+1))

    # transform count into a LIST aligned with `years`; years without citations become 0
    count = [count.get(y, 0) for y in years]

    masterc['looped'] += 1

    # we need at least 10 + REBIRTH_VISION years of data
    # (with REBIRTH_VISION = 5: 5 years alive, 5 years dead, 5 years alive)
    if len(count) < 10 + REBIRTH_VISION:
        masterc['1. not long enough'] += 1
        continue

    masterc['1. long enough'] += 1

    # for every candidate `cutpoint`, compare the average citations per year
    #   1) over all years before `cutpoint`
    #   2) over the 5 years starting at `cutpoint`
    ratio = []
    for cutpoint in range(5,len(count)-5):
        bef = np.mean(count[:cutpoint])
        cur = np.mean(count[cutpoint:cutpoint+5])
        # A work with no citations so far has nothing to die from. Recording
        # +inf keeps it above any cutoff -- the same outcome the inf/nan from
        # dividing by zero produced -- without raising a RuntimeWarning.
        ratio.append(cur/bef if bef != 0 else np.inf)
    ratio = np.array(ratio)

    # these are the indices (into `ratio`) of every time cur/bef < DEATH_CUTOFF
    death_dates, = np.where( ratio < DEATH_CUTOFF )

    if not len(death_dates):
        masterc['1. long enough and didn\'t die'] += 1
        continue

    masterc['1. long enough and did die'] += 1

    # find the first date where the ratio is less than the cutoff
    first_death = np.min(death_dates)

    # walk forward while the ratio keeps strictly decreasing, i.e. to the local minimum
    while first_death+1 < len(ratio) and ratio[first_death+1] < ratio[first_death]:
        first_death += 1

    # `ratio` starts at cutpoint 5, so shift by 5 to get an index into `count`
    first_death += 5

    # average citations per year before the death; constant for this work, so
    # it is computed once rather than on every iteration of the loop below
    before_death = np.mean(count[:first_death])

    # second ratio: each later 5-year window relative to the pre-death average.
    # candidates start at least 5 yrs after the death and stop 5 years before
    # the end of the sequence
    ratio = []
    for cutoff in range(first_death+5, len(count)-5):
        future = np.mean(count[cutoff:cutoff+5])
        ratio.append(future/before_death)
    ratio = np.array(ratio)

    if not len(ratio):
        masterc['1.2 died but can"t tell if reborn'] += 1
        continue

    rebirths, = np.where( ratio >= REBIRTH_CUTOFF )
    rebirths += 5 # this `ratio` starts 5 years after `first_death`; shift to offsets from `first_death`

    if not len(rebirths):
        masterc['2. died but wasn"t reborn'] += 1
        continue

    # index into `count` of the first qualifying rebirth window
    first_rebirth = np.min( rebirths ) + first_death

    masterc['3. died and came back hard'] += 1

    comebacks.append({
        "name": c,
        "d1": first_death+years[0],      # year of death
        "rb": first_rebirth+years[0],    # year of rebirth
        "bef": before_death,             # mean citations per year before death
        "aft": np.mean(count[first_rebirth:first_rebirth+5]),  # mean over the 5 years from rebirth
        "pub": pub,
        "davg": np.mean(count[first_death:first_rebirth])      # mean while dead
    })
Processed 52992 docs.
{'looped': 52992, '1. long enough': 26400, "1. long enough and didn't die": 16259, '1. long enough and did die': 10141, '3. died and came back hard': 5332, '1. not long enough': 26592, '2. died but wasn"t reborn': 2621, '1.2 died but can"t tell if reborn': 2188}
5332
c:\users\amcga\envs\citation-deaths\lib\site-packages\ipykernel_launcher.py:52: RuntimeWarning: divide by zero encountered in double_scalars
Processed 105985 docs.
{'looped': 105985, '1. long enough': 46259, "1. long enough and didn't die": 22131, '1. long enough and did die': 24128, '3. died and came back hard': 11470, '1. not long enough': 59726, '2. died but wasn"t reborn': 7386, '1.2 died but can"t tell if reborn': 5272}
11470
Processed 158978 docs.
{'looped': 158978, '1. long enough': 58060, "1. long enough and didn't die": 22644, '1. long enough and did die': 35416, '3. died and came back hard': 16274, '1. not long enough': 100918, '2. died but wasn"t reborn': 13231, '1.2 died but can"t tell if reborn': 5911}
16274
Processed 211971 docs.
{'looped': 211971, '1. long enough': 79576, "1. long enough and didn't die": 23447, '1. long enough and did die': 56129, '3. died and came back hard': 22293, '1. not long enough': 132395, '2. died but wasn"t reborn': 24751, '1.2 died but can"t tell if reborn': 9085}
22293
Processed 264964 docs.
{'looped': 264964, '1. long enough': 93606, "1. long enough and didn't die": 25030, '1. long enough and did die': 68576, '3. died and came back hard': 25915, '1. not long enough': 171358, '2. died but wasn"t reborn': 30884, '1.2 died but can"t tell if reborn': 11777}
25915
In [16]:
# collect the comeback records into a dataframe, one row per reborn work
# (it is written to the variables repository by the save_variable cell below)
cb = pd.DataFrame.from_records(comebacks)
In [17]:
# Derived columns that many later analyses rely on.

# strength of the comeback: mean citations after rebirth relative to before death
cb['rat'] = cb['aft'] / cb['bef']
# length of the drought: number of years between death and rebirth
cb['dlen'] = cb['rb'] - cb['d1']

save variable

In [9]:
# store the comeback table (including the derived columns) under "<database_name>.reborn"
save_variable("%s.reborn" % database_name, cb)

summary statistics. not necessary to generate variable

In [10]:
# Every counter as a percentage of the works with enough history.
# The denominator is read once with .get() before iterating: indexing the
# defaultdict inside the loop would insert a missing key while the dict is
# being iterated. A zero denominator yields NaN instead of an exception.
long_enough = masterc.get('1. long enough', 0)
tab = pd.DataFrame.from_records(
    [
        (k, 100*v/long_enough if long_enough else float('nan'))
        for k,v in masterc.items()
    ],
    columns=["counter", "percent"],
)
tab
Out[10]:
counter percent
0 looped 283.068393
1 1. long enough 100.000000
2 1. long enough and didn't die 26.739739
3 1. long enough and did die 73.260261
4 3. died and came back hard 27.685191
5 1. not long enough 183.068393
6 2. died but wasn"t reborn 32.993612
7 1.2 died but can"t tell if reborn 12.581458
In [11]:
# Every counter as a percentage of the works that had enough history AND died.
# The denominator is read once with .get() before iterating: indexing the
# defaultdict inside the loop would insert a missing key while the dict is
# being iterated. A zero denominator yields NaN instead of an exception.
died = masterc.get('1. long enough and did die', 0)
tab = pd.DataFrame.from_records(
    [
        (k, 100*v/died if died else float('nan'))
        for k,v in masterc.items()
    ],
    columns=["counter", "percent"],
)
tab
Out[11]:
counter percent
0 looped 386.387366
1 1. long enough 136.499650
2 1. long enough and didn't die 36.499650
3 1. long enough and did die 100.000000
4 3. died and came back hard 37.790189
5 1. not long enough 249.887716
6 2. died but wasn"t reborn 45.036164
7 1.2 died but can"t tell if reborn 17.173647
In [12]:
# Every counter as a percentage of the works that died and whose fate is known,
# i.e. deaths minus those with too little data afterwards to judge a rebirth.
# Both counters are read once with .get() before iterating: indexing the
# defaultdict inside the loop would insert a missing key while the dict is
# being iterated. A zero denominator yields NaN instead of an exception.
died = masterc.get('1. long enough and did die', 0)
undecided = masterc.get('1.2 died but can"t tell if reborn', 0)
resolved = died - undecided
tab = pd.DataFrame.from_records(
    [
        (k, 100*v/resolved if resolved else float('nan'))
        for k,v in masterc.items()
    ],
    columns=["counter", "percent"],
)
tab
Out[12]:
counter percent
0 looped 466.502931
1 1. long enough 164.802197
2 1. long enough and didn't die 44.067677
3 1. long enough and did die 120.734520
4 3. died and came back hard 45.625803
5 1. not long enough 301.700734
6 2. died but wasn"t reborn 54.374197
7 1.2 died but can"t tell if reborn 20.734520
In [13]:
# The raw counters, one row each. These are absolute counts, so the value
# column is labelled "count" rather than "percent".
tab = pd.DataFrame.from_records(
    list(masterc.items()),
    columns=["counter", "count"],
)
tab
Out[13]:
counter percent
0 looped 264969
1 1. long enough 93606
2 1. long enough and didn't die 25030
3 1. long enough and did die 68576
4 3. died and came back hard 25915
5 1. not long enough 171363
6 2. died but wasn"t reborn 30884
7 1.2 died but can"t tell if reborn 11777
In [14]:
# number of works that died and were reborn (one row per comeback)
len(cb)
Out[14]:
25915
In [15]:
# Comebacks whose mean yearly citations before death exceeded 1, ordered by the
# mean over the 5 years from rebirth, highest first.
had_audience = cb.bef > 1
cb[had_audience].sort_values("aft", ascending=False)
Out[15]:
name d1 rb bef aft pub davg rat dlen
3119 kalleberg|1981 1997 2010 5.937500 3.2 1981 1.000000 0.538947 13
1924 durkheim|1995 2003 2009 2.750000 3.2 1995 0.000000 1.163636 6
337 hill|1990 2003 2009 1.615385 2.8 1990 0.000000 1.733333 6
11299 ajzen|1980 2003 2011 3.894737 2.8 1980 0.125000 0.718919 8
7476 fowler|1979 1989 1994 1.125000 2.8 1979 0.000000 2.488889 5
... ... ... ... ... ... ... ... ... ...
12056 day|1996 2007 2014 1.142857 0.6 1996 0.000000 0.525000 7
12032 holland|1976 2004 2011 1.071429 0.6 1976 0.000000 0.560000 7
2634 fullerton|1982 1994 2002 1.090909 0.6 1982 0.000000 0.550000 8
2636 parcel|1991 1997 2006 1.200000 0.6 1991 0.111111 0.500000 9
6851 jacobson|1987 1998 2010 1.111111 0.6 1987 0.083333 0.540000 12

1134 rows × 9 columns

In [ ]: