RATIO_CUTOFF
tells the algorithm how strong a drop in citations has to be to qualify as a death (in the code below this threshold is the constant `DEATH_CUTOFF`). Lower means more dead. Should always be less than 1.
# Prefix of the stored counters/variables this analysis reads and writes.
database_name = "sociology-wos"

# A work "dies" when its 5-year average citation rate falls below this
# fraction of its average rate over all preceding years.
DEATH_CUTOFF = 0.1

# A dead work is "reborn" when a later 5-year average reaches at least this
# fraction of its pre-death average.
REBIRTH_CUTOFF = 0.75

# Extra years required on top of the 10-year minimum series length.
REBIRTH_VISION = 5
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
# Load the stored counters for this database. `cits['c.fy']` maps
# (cited work, year) pairs to a citation count.
cits = get_cnt("{}.doc".format(database_name), ['c.fy', 'c'])

# Per-work summary keyed by cited work; works missing from it are ignored.
cysum = load_variable("{}.cysum".format(database_name))

# cy[work][year] -> citation count, restricted to the works found in `cysum`.
cy = defaultdict(lambda: defaultdict(int))
for (c, y), count in cits['c.fy'].items():
    if c in cysum:
        cy[c][y] = count
# Scan every cited work for a "death" (a sustained collapse of its citation
# rate) followed by a "rebirth" (a recovery towards its pre-death rate).
#   comebacks -- one record per work that died and was reborn
#   masterc   -- tallies of how many works reached each stage of the filter
comebacks = []

# Optional restriction to the works listed in a previously computed `cb`
# table. Despite its name, `blacklist` is the set of works that are KEPT.
# `cb` is only built after this loop, so enabling `filt` requires an earlier run.
filt = False
if filt:
    blacklist = set(cb.name)

masterc = defaultdict(int)

# Emit roughly five progress reports. The step is clamped to at least 1 so
# that fewer than five works does not trigger a modulo-by-zero.
report_every = max(1, len(cy) // 5)

for ci, c in enumerate(cy):
    if filt and c not in blacklist:
        masterc['filtered'] += 1
        continue

    # periodic diagnostics of what has happened so far
    if (ci + 1) % report_every == 0:
        print("Processed %s docs." % ci)
        print(dict(masterc))
        print(len(comebacks))

    # the time trend of citation counts: maps year -> citation count
    count = cy[c]

    # publication date of the cited work
    pub = cysum[c]['pub']

    # Limit the time range: from the publication year (or the first cited
    # year, whichever is later) up to the last cited year.
    years = list(range(max(min(count), pub), max(count) + 1))

    # transform `count` into a dense LIST aligned with `years` (0 = uncited year)
    count = [count.get(y, 0) for y in years]

    masterc['looped'] += 1

    # we need at least 15 years of data (5 years alive, 5 years dead, 5 years alive)
    if len(count) < 10 + REBIRTH_VISION:
        masterc['1. not long enough'] += 1
        continue
    masterc['1. long enough'] += 1

    # For every candidate cutpoint compare the average citations per year
    #   1) over everything before `cutpoint`, with
    #   2) the 5 years starting at `cutpoint`.
    # A zero "before" average yields inf/nan, which never satisfies the
    # `< DEATH_CUTOFF` test below.
    ratio = np.array([
        np.mean(count[cutpoint:cutpoint + 5]) / np.mean(count[:cutpoint])
        for cutpoint in range(5, len(count) - 5)
    ])

    # indices of every cutpoint where cur/bef < DEATH_CUTOFF
    death_dates, = np.where(ratio < DEATH_CUTOFF)
    if not len(death_dates):
        masterc["1. long enough and didn't die"] += 1
        continue
    masterc['1. long enough and did die'] += 1

    # first cutpoint at which the ratio drops below the cutoff ...
    first_death = np.min(death_dates)
    # ... then walk forward to the local minimum following it
    while first_death + 1 < len(ratio) and ratio[first_death + 1] < ratio[first_death]:
        first_death += 1
    # `ratio[i]` describes cutpoint `i + 5`, so shift back into `count` indices
    first_death += 5

    # Second ratio: each later 5-year window against the pre-death average.
    # The pre-death average does not depend on the window, so compute it once.
    before_death = np.mean(count[:first_death])

    # Rebirth candidates start at least 5 years after the death and stop
    # 5 years before the end of the sequence.
    ratio = np.array([
        np.mean(count[cutpoint:cutpoint + 5]) / before_death
        for cutpoint in range(first_death + 5, len(count) - 5)
    ])
    if not len(ratio):
        masterc['1.2 died but can"t tell if reborn'] += 1
        continue

    rebirths, = np.where(ratio >= REBIRTH_CUTOFF)
    # `ratio[i]` describes the window starting 5 + i years after the death
    rebirths += 5
    if not len(rebirths):
        masterc['2. died but wasn"t reborn'] += 1
        continue

    first_rebirth = np.min(rebirths) + first_death
    masterc['3. died and came back hard'] += 1

    comebacks.append({
        "name": c,
        "d1": first_death + years[0],      # calendar year of death
        "rb": first_rebirth + years[0],    # calendar year of rebirth
        "bef": before_death,               # mean citations/year before death
        "aft": np.mean(count[first_rebirth:first_rebirth + 5]),  # first 5 reborn years
        "pub": pub,
        "davg": np.mean(count[first_death:first_rebirth]),       # mean while dead
    })
# Turn the comeback records into a dataframe and save it to the variables
# repository. The columns are named explicitly so that a run which found no
# comebacks still yields a frame with the expected columns, rather than
# failing on `cb.aft` below.
cb = pd.DataFrame.from_records(
    comebacks,
    columns=["name", "d1", "rb", "bef", "aft", "pub", "davg"],
)

# these are useful attributes for many analyses.
# ratio of the citation rate after rebirth to the rate before death
cb['rat'] = cb.aft / cb.bef
# how long the drought was (years between death and rebirth)
cb['dlen'] = cb.rb - cb.d1

save_variable("%s.reborn" % database_name, cb)
# Every counter as a percentage of the works with a long enough series.
# The denominator is read once with `.get`: indexing the defaultdict inside
# the comprehension would insert a missing key while `masterc.items()` is
# being iterated. A zero denominator yields NaN instead of raising.
denominator = masterc.get('1. long enough', 0)
tab = pd.DataFrame.from_records(
    [
        (k, 100 * v / denominator if denominator else float('nan'))
        for k, v in masterc.items()
    ],
    columns=["counter", "percent"],
)
tab
# Every counter as a percentage of the works that were long enough AND died.
# The denominator is read once with `.get`: indexing the defaultdict inside
# the comprehension would insert a missing key while `masterc.items()` is
# being iterated. A zero denominator yields NaN instead of raising.
denominator = masterc.get('1. long enough and did die', 0)
tab = pd.DataFrame.from_records(
    [
        (k, 100 * v / denominator if denominator else float('nan'))
        for k, v in masterc.items()
    ],
    columns=["counter", "percent"],
)
tab
# Every counter as a percentage of the works that died AND whose series ran
# long enough after the death to judge a rebirth.
# Both counters are read once with `.get`: the "can't tell" counter is absent
# whenever no work hit that branch, and indexing the defaultdict inside the
# comprehension would then insert it while `masterc.items()` is being
# iterated ("dictionary changed size during iteration"). A zero denominator
# yields NaN instead of raising.
denominator = (
    masterc.get('1. long enough and did die', 0)
    - masterc.get('1.2 died but can"t tell if reborn', 0)
)
tab = pd.DataFrame.from_records(
    [
        (k, 100 * v / denominator if denominator else float('nan'))
        for k, v in masterc.items()
    ],
    columns=["counter", "percent"],
)
tab
# Raw value of every counter, one row per counter.
# NOTE(review): the second column is labelled "percent" but holds the raw
# counts; the label is kept as-is in case later cells rely on it.
tab = pd.DataFrame.from_records(list(masterc.items()))
tab.columns = ["counter", "percent"]
tab
# number of works that died and were reborn
len(cb)
# comebacks averaging more than one citation per year before death,
# strongest rebirth first
cb[cb.bef > 1].sort_values(by="aft", ascending=False)