# make the knowknow package importable from wherever this notebook lives
# (_dh is IPython's directory history, so this assumes an IPython kernel)
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
showdocs("top1")
Just pick the database (database_name) and the type of count atom you want to analyze (e.g., "ta" for cited author, "c" for cited work, etc.). Note that "t" is only available for JSTOR databases.
database_name = 'sociology-wos'
ctype = 'ta'
top_percentile = 0.01
# Parameters (injected at execution time; these override the defaults above)
database_name = "sociology-wos"
ctype = "fa"
# load the per-name summary records and the (ctype, fy) citation counts
cysum = load_variable("%s.%s.ysum" % (database_name, ctype))
cits = get_cnt("%s.doc" % database_name, [comb(ctype, 'fy')])
any("-" in x for x in cysum)
all_tops = set()
print("%s total entries" % len(cysum))
# decade-long windows slide from 1940-1950 to 1980-1990, in 1-year increments
for RANGE_START, RANGE_END in zip(
    range(1940, 1980 + 1, 1),
    range(1950, 1990 + 1, 1),
):
    # accumulate each name's citation count within the current window
    count_in_range = defaultdict(int)
    for cross, count in cits[comb(ctype, 'fy')].items():
        if RANGE_START <= cross.fy <= RANGE_END:
            count_in_range[getattr(cross, ctype)] += count

    counts = list(count_in_range.values())
    if not counts:
        print("Skipping %s" % RANGE_START)
        continue

    # keep every name at or above the (1 - top_percentile) quantile of
    # in-window counts (see the toy illustration after this loop)
    q99 = np.quantile(np.array(counts), 1 - top_percentile)
    top1 = {k for k, v in count_in_range.items() if v >= q99}
    all_tops.update(top1)

    print("%s / %s in the top %0.1f%% in %s-%s (%s total accumulated)" % (
        len(top1),
        len(count_in_range),
        top_percentile * 100,
        RANGE_START, RANGE_END,
        len(all_tops)
    ))
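To make the quantile cutoff concrete, a toy illustration with fabricated counts (not project data): with 990 names cited once and ten cited fifty times, the cutoff lands just above 1, so exactly the ten heavy hitters clear it.
# toy illustration of the cutoff computed above (made-up numbers)
toy = np.array([1] * 990 + [50] * 10)  # 1,000 names; ten are heavily cited
cut = np.quantile(toy, 1 - 0.01)       # ~1.49 under linear interpolation
print((toy >= cut).sum())              # 10 -> only the heavy hitters pass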
# collect the summary record of every name that made a top slice in any window
alldf = pd.DataFrame.from_records([
    c
    for name, c in cysum.items()
    if name in all_tops
])
# normalize missing values: replaces any Python None left in the records with np.nan
alldf.fillna(value=np.nan, inplace=True)
print(alldf.shape)
# preview the result, most-cited names first
alldf.sort_values("total", ascending=False).head()
save_variable("%s.%s.top1" % (database_name, ctype), alldf)
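The saved table can be read back later with the same loader used at the top of this notebook, for example:
# hypothetical follow-up usage: re-load the saved top-percentile table
top1_df = load_variable("%s.%s.top1" % (database_name, ctype))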