import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
# Instructions: carefully set the user parameters and run the entire notebook.
#
# Output: the variable is called "<database_name>.<dtype>.ysum" --
# for example, "sociology-wos.ta.ysum".
#
# Description: the following notebook performs analyses on the time trend of
# each "item" of type dtype ("ta" for cited author, "c" for cited work, or
# "t" for cited term). These statistics are universally useful, and computing
# them all once saves code and computation time in later analyses. It also
# standardizes common operations, reducing the probability of a bug going
# unnoticed.
#
# Parameters:
#   database_name -- you should already have a database of counts to use this
#       notebook, e.g. "sociology-wos".
#   dtype -- the item of interest ("ta" for cited author, "c" for cited work,
#       or "t" for cited term).
# User parameters (defaults; overridden by the injected "Parameters" cell below).
database_name = "sociology-wos-all"  # which counts database to analyze
dtype = 'c'  # item type: "ta" cited author, "c" cited work, "t" cited term
# Death cutoffs: a year "dies" at cutoff c when the 10-year-ahead citation
# rate drops to <= c times the prior average rate.
D_CUTOFFS = [0, 0.1, 0.2, 0.3, 0.5]
# Rebirth cutoffs: a dead item is "reborn" when the later rate exceeds
# cutoff times the pre-death average rate.
RB_CUTOFFS = [0.3, 0.5, 1, 2]
compute_deaths = True  # whether to run the cutoff-based death/rebirth pass
# Parameters
# NOTE(review): this looks like a papermill-injected parameters cell -- it
# overrides the database_name/dtype defaults set above. "ffa" is presumably
# first-author; confirm the counts database actually provides this dtype.
database_name = "sociology-wos"
dtype = "ffa"
# Determine which data-processing pipeline produced this database and set the
# matching constants. Exactly one of 'wos' / 'jstor' must appear in the name.
pubyears = None

is_wos = 'wos' in database_name
is_jstor = 'jstor' in database_name

if is_wos and is_jstor:
    raise Exception("Please put 'wos' or 'jstor' but not both in any database_name.")

if is_wos:
    # Publication years are only recorded for Web of Science databases.
    pubyears = load_variable("%s.pubyears" % database_name)
    print("Pubyears loaded for %s entries" % len(pubyears.keys()))
    RELIABLE_DATA_ENDS_HERE = 2019
    database_type = 'wos'
elif is_jstor:
    RELIABLE_DATA_ENDS_HERE = 2010
    database_type = 'jstor'
else:
    raise Exception("Please include either 'wos' or 'jstor' in the name of the variable. This keys which data processing algorithm you used.")
class ysums:
    """Accumulator for finished ysum records plus processing-outcome counters."""

    def __init__(self):
        # collected per-item summary objects, in processing order
        self.ysums = []
        # tallies of filtering outcomes (dropped, passed, etc.)
        self.meta = defaultdict(int)

    def add(self, x):
        """Append one finished summary record."""
        self.ysums.append(x)
class ysum:
    """One per-item summary record.

    Analysis steps are registered by name via addinit(); __init__ stores the
    keyword arguments as attributes and then runs every registered step.
    dump() returns only the attributes added after construction began.
    """

    # method names to invoke, in registration order, during __init__
    init_stack = []

    def __init__(self, **kwargs):
        # Snapshot the attribute names that exist before user data arrives,
        # so dump() can exclude this bookkeeping later.
        self.dict_filter = set(list(self.__dict__) + ["dict_filter"])

        for key, value in kwargs.items():
            setattr(self, key, value)

        # Run every registered analysis step on this instance.
        for step in self.init_stack:
            getattr(self, step)()

    def dump(self):
        """Return a plain dict of public computed attributes.

        Skips the bookkeeping filter set and anything starting with '_'.
        """
        keep = set(self.__dict__) - self.dict_filter
        return {
            name: getattr(self, name)
            for name in keep
            if not name.startswith('_')
        }

    @classmethod
    def addinit(cls, st):
        """Register method name *st* to run during __init__ (idempotent).

        Appends to the (possibly inherited, shared) init_stack list so that
        later subclasses see the same registration order.
        """
        if st not in cls.init_stack:
            cls.init_stack.append(st)
# Citation counts from the doc-count database, keyed three ways:
# (dtype, fy) cross counts, per-year totals ('fy'), and per-item totals.
ysum.cits = get_cnt("%s.doc"%database_name, [comb(dtype,'fy'),'fy',dtype])

# The following reorganizes the data slightly, for efficiency.
# (FIX: this sentence was a bare markdown line in the notebook export, which
# is a Python syntax error; it is now a comment.)
# ysum.cy maps item name -> {year -> citation count}.
ysum.cy = defaultdict(lambda:defaultdict(int))
for cross,count in ysum.cits[ comb(dtype, 'fy') ].items():
    ysum.cy[ getattr(cross, dtype) ][ cross.fy ] = count
# Register simple_stats to run for every ysum instance constructed below.
ysum.addinit("simple_stats")

class ysum(ysum):
    def simple_stats(self):
        """Basic per-item statistics: first/last citation year, peak count and
        peak proportion (with their years), totals, and -- for cited works --
        the publication year and article/book type."""
        # year -> raw citation count for this item
        self._c = self.cy[ self.name ]
        # year -> this item's share of ALL citations made that year
        self._p = {
            y: count / self.cits['fy'][(y,)]
            for y,count in self._c.items()
            if count > 0 and self.cits['fy'][(y,)] > 0
        }
        self.first = min(self._c)  # first year cited
        self.last = max(self._c)   # last year cited
        # Peak years; the (value, year) key breaks ties toward the LATEST year.
        self.maxcounty = max(self._c, key=lambda y:(self._c[y],y))
        self.maxpropy = max(self._p, key=lambda y:(self._p[y],y))
        self.maxprop = self._p[ self.maxpropy ]
        self.maxcount = self._c[ self.maxcounty ]
        self.total = sum(self._c.values())
        self.totalprop = sum(self._p.values())
        if dtype == 'c':
            # extracts some extra information from the name
            self.type = 'article'
            if database_type == 'wos':
                # WoS cited-work names look like "author|year|..."; a
                # non-integer second field marks a book.
                sp = self.name.split("|")
                try:
                    self.pub = int(sp[1])
                    self.type = 'article'
                except ValueError:
                    # Books get their publication year from the pubyears table
                    # loaded earlier (WoS databases only).
                    self.type = 'book'
                    self.pub = pubyears[self.name]
            elif database_type == 'jstor':
                # JSTOR names carry the publication year in parentheses.
                inparens = re.findall(r'\(([^)]+)\)', self.name)[0]
                self.pub = int(inparens)

    def sum_between(self, A, B): #not including B
        """Total citations in years y with A <= y < B (B excluded)."""
        if A > B:
            raise Exception("Can only sum forwards in time. Fix your code...")
        return sum( c for y,c in self._c.items() if B > y >= A )
# death_1 is the last year of maximum citations, as long as it is before
# RELIABLE_DATA_ENDS_HERE.
#ysum.addinit("_death1")  # registration disabled: death1 not computed by default

class ysum(ysum):
    def _death1(self):
        """death1: the peak-count year, accepted only when the peak-proportion
        year falls inside the reliable-data window; otherwise None."""
        # NOTE(review): the guard tests maxpropy but the stored value is
        # maxcounty -- possibly intentional, worth confirming.
        self.death1 = None
        if self.maxpropy <= RELIABLE_DATA_ENDS_HERE:
            self.death1 = self.maxcounty
# death_2
#ysum.addinit("_death2")  # registration disabled: death2 not computed by default

class ysum(ysum):
    def _death2(self):
        """death2: first year at/after the citation peak from which no later
        lookahead window ever reaches the peak count again; None otherwise."""
        # One lookahead sum per candidate year, starting at the peak year.
        # sum_between(A, B) excludes B, so each window spans [ycheck, ycheck+9),
        # i.e. 9 years (despite the original "next ten years" comment).
        window_sums = [
            (ycheck, self.sum_between(ycheck, ycheck + 9))
            for ycheck in range(self.maxcounty, RELIABLE_DATA_ENDS_HERE - 10)
        ]

        # The death year is the first candidate such that no window from it
        # onward reaches maxcount -- equivalently, the entry right after the
        # LAST window that still reaches maxcount.
        last_alive = -1
        for idx, (_, total) in enumerate(window_sums):
            if total >= self.maxcount:
                last_alive = idx

        self.death2 = None
        if last_alive + 1 < len(window_sums):
            self.death2 = window_sums[last_alive + 1][0]
        return None
#ysum.addinit("_death3")  # registration disabled: death3 not computed by default

class ysum(ysum):
    def _death3(self):
        """death3: the final citation year, counted as a death only when it is
        at least 10 years before the reliable-data horizon (i.e. no citations
        at all during the last usable decade)."""
        horizon = RELIABLE_DATA_ENDS_HERE - 10
        self.death3 = self.last if self.last <= horizon else None
#ysum.addinit("_death5")  # registration disabled: death5 not computed by default

class ysum(ysum):
    def _death5(self):
        """death5: the year by which >= 90% of all citations had arrived,
        provided more than 30% of the observable lifespan still remains."""
        # Citation counts for every year 1900-2019 (zero where uncited).
        myspan = np.array([
            self.cits[comb(dtype, 'fy')][make_cross({dtype: self.name, 'fy': ycheck})]
            for ycheck in range(1900, 2020)
        ])
        self.death5 = None

        csum = np.sum(myspan)  # total citations ever received
        Ea = csum              # running count of citations still to come

        # BUG FIX: np.where() returns a TUPLE of index arrays; the original
        # wrapped it in list(), so len() was always 1, the emptiness guard
        # could never fire, and an all-zero span was only caught by a bare
        # except around np.min. Taking [0] makes the guard work directly.
        nonzeroyears = np.where(myspan > 0)[0]
        if not len(nonzeroyears):
            # no citations at all in 1900-2019: death5 stays None
            return

        firsti = int(np.min(nonzeroyears))
        first_year = firsti + 1900

        for cci, cc in enumerate(myspan[firsti:]):
            this_year = first_year + cci
            # running residual: citations remaining after this year
            Ea -= cc
            # don't let them die too soon: the death year must itself be cited
            if cc == 0:
                continue
            # < 10% of citations remain AND > 30% of the observable span remains
            if Ea/csum < 0.1 and (RELIABLE_DATA_ENDS_HERE - this_year)/(RELIABLE_DATA_ENDS_HERE - first_year) > 0.3:
                self.death5 = this_year
                break
if compute_deaths:
    ysum.addinit("_deathN")  # register the cutoff-based death/rebirth pass

class ysum(ysum):
    def _deathN(self):
        """For each cutoff in D_CUTOFFS, find the first post-peak year where
        the 10-year-ahead citation rate falls to <= cutoff * the prior average
        rate ("death"). For each (death, rebirth) cutoff pair, find the first
        later year (>= 10 years after the death) where the rate recovers above
        rebirth-cutoff * the pre-death rate. Results are stored as attributes
        death_<d*10> and rebirth_<d*10>_<rb*10>, or None."""
        def getname(CUTOFF):
            # e.g. CUTOFF=0.3 -> "death_3"
            return "death_%d"%(CUTOFF*10)
        def getrname(dCUTOFF, rbCUTOFF):
            # e.g. (0.5, 2) -> "rebirth_5_20"
            return "rebirth_%d_%d"%(dCUTOFF*10, rbCUTOFF*10)

        # preset all deaths as None
        for CUTOFF in D_CUTOFFS:
            setattr(self, getname(CUTOFF), None)
        for dCUTOFF in D_CUTOFFS:
            for rbCUTOFF in RB_CUTOFFS:
                setattr(self, getrname(dCUTOFF, rbCUTOFF), None)

        for y in range(self.maxcounty + 1, RELIABLE_DATA_ENDS_HERE):
            #if (RELIABLE_DATA_ENDS_HERE - y)/(RELIABLE_DATA_ENDS_HERE - self.first) < 0.3:
            #    # eventually we just can't tell anymore...
            #    continue

            # need a full 10-year lookahead window of reliable data
            if RELIABLE_DATA_ENDS_HERE - y <= 10:
                continue

            # death: average yearly rate before y vs. over the next 10 years
            before = self.sum_between(self.first, y) / (y - self.first)
            after = self.sum_between(y,y+10) / 10
            for CUTOFF in D_CUTOFFS:
                myname = getname(CUTOFF)
                if getattr(self, myname) is not None:
                    continue  # earliest qualifying year wins
                if after/before <= CUTOFF:
                    setattr(self, myname, y)

            # rebirth
            after = self.sum_between(y,y+10) / 10
            for D_CUTOFF in D_CUTOFFS:
                mydname = getname(D_CUTOFF)
                for RB_CUTOFF in RB_CUTOFFS:
                    myrname = getrname(D_CUTOFF,RB_CUTOFF)
                    mydeath = getattr(self, mydname)
                    if mydeath is None: # if there's no death, there can't be a rebirth
                        continue
                    if not( y >= mydeath+10 ): # at least 10 years dead!
                        continue
                    # pre-death average rate vs. the 10 years after y
                    before = self.sum_between(self.first, mydeath) / (mydeath - self.first)
                    if after/before > RB_CUTOFF:
                        if getattr(self, myrname) is None:
                            # NOTE(review): this rebinds the loop variable y
                            # (snapping it to the next actual citation year).
                            # Later cutoff pairs in THIS iteration see the
                            # shifted y; the range iterator is unaffected.
                            y = min( yy for yy,c in self._c.items() if yy >= y and c > 0 ) # push the rebirth to the next citation...
                            setattr(self, myrname, y)

        # label those on the cusp explicitly (look at y=RELIABLE_DATA_ENDS_HERE-10)
        # if they wouldn't die that year, mark them as True.
        # if they would die in any year after, mark them as None
print("Processing database '%s'"%database_name)

mysums = ysums()

# Build one ysum record per cited item, filtering out items whose trends are
# unusable, and tallying every drop reason in mysums.meta.
for ci,item in enumerate(ysum.cy):
    # progress reporting
    if ci % 5000 == 0:
        print("Item %s" % ci)
    if ci % 50000 == 0:
        print(mysums.meta)
    #if ci > 10000:
    #    break

    # FIX: removed dead assignment `res = {}` -- it was never read.

    mysums.meta['at least one citation'] += 1

    # WoS cited-work names must have at least "author|year" parts
    if database_type == 'wos' and dtype == 'c':
        sp = item.split("|")
        if len(sp) < 2:
            mysums.meta['not enough parts'] += 1
            continue

    # constructing the ysum runs every registered analysis step
    mysum = ysum( name=item )

    #small error catch
    if hasattr(mysum,'pub') and mysum.first < mysum.pub:
        mysums.meta['first citation before pub date'] += 1
        continue

    # don't care about those with only a single citation
    if mysum.total <= 1:
        mysums.meta['literally 1 citation. dropped.'] += 1
        continue

    # we really don't care about those that never rise in use
    if mysum.first == mysum.maxpropy:
        mysums.meta['never rise'] += 1
        continue

    mysums.meta['passed tests pre-blacklist'] += 1
    mysums.add(mysum)
# Preview the first few summary records, attributes sorted by name.
for record in mysums.ysums[:10]:
    dumped = record.dump()
    for key in sorted(dumped):
        print(key, dumped[key])
    print("----------------------------")

# Make sure every record carries rebirth_5_0 .. rebirth_5_9 attributes,
# filling in None wherever the death/rebirth pass never set one.
for idx in range(10):
    rbname = "rebirth_5_%s" % idx
    for record in mysums.ysums:
        if not hasattr(record, rbname):
            setattr(record, rbname, None)

# Notebook display expressions (results are discarded when run as a script).
sorted(mysums.ysums[0].dump())
sum(record.rebirth_5_3 is not None for record in mysums.ysums)

print(mysums.meta)
# Toggle: build a flat DataFrame of all summaries (disabled by default).
if False:
    mysum_final = pd.DataFrame.from_records( list(x.dump() for x in mysums.ysums) )
    mysum_final.shape

# Default save path: a dict keyed by item name, persisted under the variable
# name "<database_name>.<dtype>.ysum".
if True:
    mysum_final = {}
    for x in mysums.ysums:
        dp = x.dump()
        mysum_final[dp['name']] = dp
    varname = "%s.%s.ysum"%(database_name,dtype)
    save_variable(varname, mysum_final)

# Also persist the set of item names that survived filtering.
if True:
    save_variable("%s.included_citations"%database_name, set(x.name for x in mysums.ysums))