# _dh is IPython's directory history; this puts the knowknow repo root on sys.path
import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
database_name = 'sociology-wos-all'
import string_grouper
import editdistance
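# string_grouper finds candidate matches via cosine similarity of tf-idf character n-grams;
# editdistance provides a fast Levenshtein (edit) distance, used below to filter those candidates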
# the final variable we are constructing
groups = {}
# tracks the last group-id assigned
new_gid = 0
try:
    strings = list(load_variable("%s.c.ysum" % database_name))
except VariableNotFound:
    print("You need to generate ysum before running this notebook.")
    raise
len(strings)
def isarticle(x):
    # article keys look like "author|year|...": a second field that parses
    # as an integer marks an article; anything else is treated as a book
    sp = x.split("|")
    if len(sp) < 2:
        return False
    try:
        int(sp[1])
        return True
    except ValueError:
        return False
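# quick sanity check (the first key is illustrative, not drawn from the data;
# the second is a real book key that appears later in this notebook):
assert isarticle("merton|1968|am sociol rev")
assert not isarticle("bourdieu|distinction social c")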
# drop entries whose title was never captured, then split articles from books
strings = [x for x in strings if '[no title captured]' not in x]
articles = [x for x in strings if isarticle(x)]
books = [x for x in strings if not isarticle(x)]
articles[:10]
books[:10]
print("%s articles, %s books to group" % (len(articles), len(books)))
# this cell may take quite a while to run.
# on Intel i7-9700F this runs in about a minute on 185k names.
books_grouped = string_grouper.match_strings(
    pd.Series(books),
    number_of_processes=8,
    min_similarity=0.7
)
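# the result has one row per candidate pair, with columns
# left_side / right_side / similarity (cosine similarity over tf-idf n-grams)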
# inspect the non-exact matches, weakest first
books_grouped[(books_grouped.similarity < 1 - 1e-8)].sort_values("similarity")
# for books, we require that the authors are no more than 1 edit from each other
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F
ft = defaultdict(set)
for i, r in books_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
    if ls == rs:
        continue
    la = ls.split("|")[0]
    ra = rs.split("|")[0]
    if editdistance.eval(la, ra) > 1:
        continue
    ft[ls].add(rs)
    ft[rs].add(ls)
print("%s books have some connection to others in a group" % len(ft))
# assigns group-ids based on the relational structure derived thus far
# the code propagates ids through the network, assuming transitivity of equality
def traverse(x, gid):
    global groups
    groups[x] = gid
    neighbors = ft[x]
    for n in neighbors:
        if n not in groups:
            traverse(n, gid)

for i, k in enumerate(books):
    if k in groups:
        continue
    traverse(k, new_gid)
    new_gid += 1
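# note: traverse is recursive, so an unusually large equivalence class could in
# principle hit Python's default recursion limit. An iterative equivalent using
# the same ft/groups structures (a sketch; not used below):
def traverse_iter(x, gid):
    stack = [x]
    while stack:
        cur = stack.pop()
        if cur in groups:
            continue
        groups[cur] = gid
        stack.extend(n for n in ft[cur] if n not in groups)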
len(set(groups.values()))
# the ten largest groups so far, restricted to two-part (author|title) keys
Counter(gid for x, gid in groups.items() if len(x.split("|")) == 2).most_common(10)
# this cell may take quite a while to run.
# on Intel i7-9700F this runs in five minutes on 234k entries.
articles_grouped = string_grouper.match_strings(
    pd.Series(articles),
    number_of_processes=8,  # decrease this number to 1 or 2 for slower computers or laptops (the fan might start screaming)
    min_similarity=0.8  # the similarity cutoff is tighter for articles than for books
)
# inspect the non-exact matches, weakest first
articles_grouped[(articles_grouped.similarity < 1 - 1e-8)].sort_values("similarity")
# for articles, we require that the entire citation strings are no more than 1 edit apart.
# even after limiting the comparisons necessary, this takes about 20s on Intel i7-9700F
# this cell produces the `ft` variable, which maps from each term to the set of equivalent terms. I.e., `ft[A] = {B1,B2,B3}`
ft = defaultdict(set)
for i, r in articles_grouped.iterrows():
    ls = r.left_side
    rs = r.right_side
    if ls == rs:
        continue
    if editdistance.eval(ls, rs) > 1:
        continue
    ft[ls].add(rs)
    ft[rs].add(ls)
print("%s articles have some connection to others in a group" % len(ft))
# assigns group-ids based on the relational structure derived thus far
# the code propagates ids through the network, assuming transitivity of equality
# (the same traversal as for books, rerun against the article `ft`)
def traverse(x, gid):
    global groups
    groups[x] = gid
    neighbors = ft[x]
    for n in neighbors:
        if n not in groups:
            traverse(n, gid)

for i, k in enumerate(articles):
    if k in groups:
        continue
    traverse(k, new_gid)
    new_gid += 1
# this halts execution if there aren't as many group assignments as we have articles and books
assert len(articles) + len(books) == len(groups)
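# diagnostics: how the number of distinct groups compares to the raw article and
# book counts; the shortfall in the last comparison is the number of strings
# merged into another key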
len(set(groups.values()))
len(set(groups.values())) - len(articles)
len(set(groups.values())) - len(books) - len(articles)
len(books)
len(articles)
# saving the variable for later
save_variable("%s.groups" % database_name, groups)
# reload as a round-trip check
g = load_variable("%s.groups" % database_name)
len(g)
# the ten largest groups among keys with more than two fields (article-style keys)
Counter([g[x] for x in groups if len(x.split("|")) > 2]).most_common(10)
len(set(g.values())) - len(g)
len(set(g.values()))
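# spot-check the grouped keys against raw citation counts. cits['c'] is keyed by
# 1-tuples of cited-work strings (see the lookups below); cits['c.fy'] presumably
# breaks the same counts down by year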
cits = get_cnt('sociology-wos.ind', ['c','c.fy'])
sum(cits['c.fy'].values())
list(cits['c.fy'].items())[:5]
list(g)[:5]
g['bourdieu|distinction social c']
# the 20 most-cited works overall
to_print = sorted(cits['c'].items(), key=lambda x: -x[1])[:20]
to_print = [x[0] for x in to_print]
[x for x in g if 'bourd' in x]
list(cits['c.fy'])[:5]
cits['c'][('bourdieu|logic practice',)]
to_print
# print the first five group-ids whose members have any recorded citations,
# listing each member with its citation count
printed_i = 0
checking_j = 0
while printed_i < 5:
    mine = [k for k in g if g[k] == checking_j]
    myvals = [cits['c'][(k,)] for k in mine]
    if sum(myvals) > 0:
        print(", ".join("%s (%s)" % (k, v) for k, v in zip(mine, myvals)))
        printed_i += 1
    checking_j += 1