import sys; sys.path.append(_dh[0].split("knowknow")[0])
from knowknow import *
showdocs("counter")
Data can be downloaded from any Web of Science search results page:
Export -> Other File Formats, with Record Content -> Full Record and Cited References
and File Format -> Tab Delimited (Win).
Put the downloaded .txt files in a directory and point the variable wos_base below to it.
#wos_base = "path/to/wos/data"
wos_base = "G:/My Drive/projects/qualitative analysis of literature/pre 5-12-2020/009 get everything from WOS"
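As a quick sanity check, here is a minimal sketch (assuming wos_base points at a readable directory) that counts the export files the loops below will walk:
from pathlib import Path
print("Found %d WoS export files under %s" % (len(list(Path(wos_base).glob("**/*.txt"))), wos_base))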
#journal_map = {
# "journal of social forces": "social forces",\
# "studies in symbolic interaction*": "studies in symbolic interaction"
#}
# cited "author" strings that are actually publishers, places, or parsing junk rather than people
name_blacklist = [
"*us", 'Press', 'Knopf', '(January', 'Co', 'London', 'Bros', 'Books', 'Wilson','[anonymous]'
]
debug = False
database_name = "sociology-wos"
RUN_PREGROUP = True
RUN_EVERYTHING = True
use_included_citations_filter = False
if use_included_citations_filter:
included_citations = load_variable("%s-all/included_citations" % database_name)
use_included_journals_filter = True
journal_keep = ["ETHNIC AND RACIAL STUDIES", "LAW & SOCIETY REVIEW", "DISCOURSE & SOCIETY", "SOCIOLOGICAL INQUIRY", "CONTRIBUTIONS TO INDIAN SOCIOLOGY", "SOCIETY & NATURAL RESOURCES", "RATIONALITY AND SOCIETY", "DEVIANT BEHAVIOR", "ACTA SOCIOLOGICA", "SOCIOLOGY-THE JOURNAL OF THE BRITISH SOCIOLOGICAL ASSOCIATION", "WORK EMPLOYMENT AND SOCIETY", "SOCIOLOGICAL METHODS & RESEARCH", "SOCIOLOGICAL PERSPECTIVES", "JOURNAL OF MARRIAGE AND FAMILY", "WORK AND OCCUPATIONS", "JOURNAL OF CONTEMPORARY ETHNOGRAPHY", "THEORY AND SOCIETY", "POLITICS & SOCIETY", "SOCIOLOGICAL SPECTRUM", "RACE & CLASS", "ANTHROZOOS", "LEISURE SCIENCES", "COMPARATIVE STUDIES IN SOCIETY AND HISTORY", "SOCIAL SCIENCE QUARTERLY", "MEDIA CULTURE & SOCIETY", "SOCIOLOGY OF HEALTH & ILLNESS", "SOCIOLOGIA RURALIS", "SOCIOLOGICAL REVIEW", "TEACHING SOCIOLOGY", "BRITISH JOURNAL OF SOCIOLOGY", "JOURNAL OF THE HISTORY OF SEXUALITY", "SOCIOLOGY OF EDUCATION", "SOCIAL NETWORKS", "ARMED FORCES & SOCIETY", "YOUTH & SOCIETY", "POPULATION AND DEVELOPMENT REVIEW", "SOCIETY", "JOURNAL OF HISTORICAL SOCIOLOGY", "HUMAN ECOLOGY", "INTERNATIONAL SOCIOLOGY", "SOCIAL FORCES", "EUROPEAN SOCIOLOGICAL REVIEW", "JOURNAL OF HEALTH AND SOCIAL BEHAVIOR", "SOCIOLOGICAL THEORY", "SOCIAL INDICATORS RESEARCH", "POETICS", "HUMAN STUDIES", "SOCIOLOGICAL FORUM", "AMERICAN SOCIOLOGICAL REVIEW", "SOCIOLOGY OF SPORT JOURNAL", "SOCIOLOGY OF RELIGION", "JOURNAL OF LAW AND SOCIETY", "GENDER & SOCIETY", "BRITISH JOURNAL OF SOCIOLOGY OF EDUCATION", "LANGUAGE IN SOCIETY", "AMERICAN JOURNAL OF ECONOMICS AND SOCIOLOGY", "ANNALS OF TOURISM RESEARCH", "SOCIAL PROBLEMS", "INTERNATIONAL JOURNAL OF INTERCULTURAL RELATIONS", "SOCIAL SCIENCE RESEARCH", "SYMBOLIC INTERACTION", "JOURNAL OF LEISURE RESEARCH", "ECONOMY AND SOCIETY", "SOCIAL COMPASS", "SOCIOLOGICAL QUARTERLY", "JOURNAL OF MATHEMATICAL SOCIOLOGY", "AMERICAN JOURNAL OF SOCIOLOGY", "REVIEW OF RELIGIOUS RESEARCH", "RURAL SOCIOLOGY", "JOURNAL FOR THE SCIENTIFIC STUDY OF RELIGION", "ARCHIVES EUROPEENNES DE SOCIOLOGIE", "CANADIAN JOURNAL OF SOCIOLOGY-CAHIERS CANADIENS DE SOCIOLOGIE"]
journal_keep = [x.lower() for x in journal_keep]
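A quick, optional check on the inclusion filter:
print("%d journals in the sociology journal inclusion filter" % len(journal_keep))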
groups=None
if RUN_PREGROUP:
# loading precomputed groupings of cited books and articles, if the grouping has been generated
try:
groups = load_variable("%s-all/groups" % database_name)
ysumc = load_variable("%s-all/c.ysum" % database_name)
print("Groups successfully loaded.")
# load the existing citation counts; used below to pick each group's most-cited representative
current_c = get_cnt("%s-all/doc"%database_name, ['c'])
print("Citations successfully loaded.")
except VariableNotFound:
print("Groups don't exist yet. It's important to incorporate a fuzzy match for cited references. "
"Run this script once, then run 'trend summaries/cysum.ipynb' to generate summaries and filter out small cited works. "
"Then run 'grouping article and book names.ipynb' to generate groupings. "
"Then run this notebook again to generate counts for grouped entries. "
"And finally, to make sure cysum is up to date, run 'trend summaries/cysum.ipynb' one more time. "
)
from csv import DictReader
dcount = 0
total_inserts = 0
to_inserts = []
basedir = Path(wos_base)
# keeps track of all the years seen for grouped citations
multi_year = defaultdict(lambda: defaultdict(int))
# Instantiating counters
cnt_ind = defaultdict(lambda:defaultdict(int))
track_doc = defaultdict(lambda:defaultdict(set))
cnt_doc = defaultdict(lambda:defaultdict(int))
def cnt(term, space, doc):
if ".".join(sorted(space.split("."))) != space:
raise Exception(space, "should be sorted...")
# track_doc[space][term] is a set of citing-document ids, so each document counts only once
track_doc[space][term].add(doc)
# update cnt_doc
cnt_doc[space][term] = len(track_doc[space][term])
# update ind count
cnt_ind[space][term] += 1
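To illustrate the difference between the instance counter (cnt_ind) and the document counter (cnt_doc), here is a throwaway example with hypothetical ids; the "test" space is deleted right away so it never reaches the saved counts:
cnt("example|ref", "test", "DOC1")
cnt("example|ref", "test", "DOC1")  # same citing document again
cnt("example|ref", "test", "DOC2")
print(cnt_ind["test"]["example|ref"], cnt_doc["test"]["example|ref"])  # 3 occurrences in 2 documents
del cnt_ind["test"], cnt_doc["test"], track_doc["test"]  # remove the throwaway space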
# This cell raises csv's field size limit so that very large fields in the WoS exports don't raise OverflowError
import sys
import csv
maxInt = sys.maxsize
while True:
# decrease the maxInt value by factor 10
# as long as the OverflowError occurs.
try:
csv.field_size_limit(maxInt)
break
except OverflowError:
maxInt = int(maxInt/10)
def fixcitedauth(a):
a = a.strip()
if not len(a):
return None
sp = a.lower().split()
if len(sp) < 2:
return None
if len(sp) >= 5:
return None
l, f = a.lower().split()[:2] # take first two words
if len(l) == 1: # sometimes last and first name are switched for some reason
l, f = f, l
f = f[0] + "." # first initial
a = ", ".join([l, f]) # "lastname, f." format
a = a.title() # title format, so I don't have to worry later
if debug:
print('cited author:', a)
return a
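A few hypothetical inputs, to show the normalization:
print(fixcitedauth("GOFFMAN ERVING"))  # -> "Goffman, E."
print(fixcitedauth("E GOFFMAN"))       # one-letter last name triggers the swap -> "Goffman, E."
print(fixcitedauth("[anonymous]"))     # a single word -> None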
def fix_refs(refs):
for r in refs:
yspl = re.split("((?:18|19|20)[0-9]{2})", r)
if len(yspl) < 2:
continue
auth, year = yspl[:2]
auth = auth.strip()
if len(auth.split(" ")) > 5:
continue
year = int(year)
if auth == "":
continue
auth = fixcitedauth( auth )
if auth is None: # catching non-people, and not counting the citations
continue
full_ref = r
if 'DOI' not in full_ref and not ( # it's a book!
len(re.findall(r', [Vv][0-9]+', full_ref)) or
len(re.findall(r'[0-9]+, [Pp]?[0-9]+', full_ref))
):
#full_ref = re.sub(r', [Pp][0-9]+', '', full_ref) # get rid of page numbers!
full_ref = "|".join( # splits off the author and year, and takes until the next comma
[auth]+
[x.strip().lower() for x in full_ref.split(",")[2:3]]
)
else: # it's an article!
# just adds a fixed name and date to the front
full_ref = "|".join(
[auth, str(year)] +
[",".join( x.strip() for x in full_ref.lower().split(",")[2:] ).split(",doi")[0]]
)
if use_included_citations_filter:
if full_ref not in included_citations:
continue
# implement grouping of references
if groups is not None:
if full_ref in groups:
# retrieves retrospectively-computed groups
full_ref = group_reps[
groups[full_ref]
]
elif full_ref in ysumc and '[no title captured]' not in full_ref:
# a small minority, the ones which are dropped in this process anyways
#print('not in grouping')
raise Exception(full_ref, "not in grouping!")
else:
continue
if debug:
print('fix_refs_worked',auth,year,full_ref)
yield ( auth, year, full_ref )
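# For intuition, two hypothetical CR strings and roughly what fix_refs yields for them on a
# first pass (when groups is None):
#   "GOFFMAN E, 1959, PRESENTATION SELF EV"  (no DOI/volume/page -> treated as a book)
#     -> ("Goffman, E.", 1959, "Goffman, E.|presentation self ev")
#   "SMITH J, 2001, AM SOCIOL REV, V66, P100, DOI 10..."  (volume/page/DOI -> treated as an article)
#     -> ("Smith, J.", 2001, "Smith, J.|2001|am sociol rev,v66,p100")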
if RUN_PREGROUP:
# loading precomputed groupings of cited books and articles, if the grouping has been generated
if groups is None:
print("Groups don't exist yet. It's important to incorporate a fuzzy match for cited references. "
"Run this script once, then run 'trend summaries/cysum.ipynb' to generate summaries and filter out small cited works. "
"Then run 'grouping article and book names.ipynb' to generate groupings. "
"Then run this notebook again to generate counts for grouped entries. "
"And finally, to make sure cysum is up to date, run 'trend summaries/cysum.ipynb' one more time."
)
raise Exception("Grouperror")
from collections import defaultdict
def get_reps(groups):
ret = defaultdict(set)
for k,v in groups.items():
ret[v].add(k)
ret = {
k: max(v, key=lambda x:current_c['c'][x])
for k,v in ret.items()
}
return ret
group_reps = get_reps(groups)
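# For example (hypothetical keys): if groups maps both "Goffman, E.|presentation self" and
# "Goffman, E.|presentation self ev" to the same group id, group_reps picks whichever of the
# two variants has the higher count in current_c['c'] as the canonical name for that group.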
# processes WoS txt output files one by one, counting relevant cooccurrences as it goes
dcount=0
wos_files = list(basedir.glob("**/*.txt"))
for i, f in enumerate(wos_files):
with f.open(encoding='utf8') as pfile:
r = DictReader(pfile, delimiter="\t")
rows = list(r)
if i % 50 == 0:
print("File %s/%s: %s" % (i, len(wos_files), f.name))
for r in rows:
if r['DT'] != "Article":
continue
#print(refs)
dcount += 1
if dcount % 10000 == 0:
print("Document: %s" % dcount)
print("Citations: %s" % len(cnt_doc['c']))
if debug:
print("DOCUMENT %s" % dcount)
if dcount > 10:
raise Exception("debug mode: stopping after 10 documents")
refs = r["CR"].strip().split(";")
refs = list( fix_refs(refs) )
if not len(refs):
continue
def fixcitingauth():
authors = r['AU'].split(";")
for x in authors:
x = x.strip().lower()
x = re.sub(r"[^a-zA-Z\s,]+", "", x) # only letters, spaces, and commas allowed
xsp = x.split(", ")
if len(xsp) < 2:
yield xsp[0] # no comma found; keep the single token as the author name
continue
elif len(xsp) > 2:
raise Exception("author with too many commas", x)
f, l = xsp[1], xsp[0]
f = f[0] # take only first initial of first name
yield "%s, %s" % (l,f)
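# For a hypothetical record with r['AU'] = "SMITH, JOHN; DOE, JANE R", this generator
# yields "smith, j" and "doe, j" (citing-author names stay lowercase).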
citing_authors = list(fixcitingauth())
if not len(citing_authors):
continue
if debug:
print("citing authors: ", citing_authors)
uid = r['UT']
try:
int(r['PY'])
except ValueError:
continue
r['SO'] = r['SO'].lower() # journal names are lowercased here, so all later matching is case-insensitive
if use_included_journals_filter and r['SO'] not in journal_keep:
continue
for (auth,year,full_ref) in refs:
ref = (auth,year)
if ref[0] in name_blacklist:
continue
if "*" in ref[0]:
continue
# BEGIN COUNTING!!!
multi_year[full_ref][year] += 1
cnt(r['SO'], 'fj', uid)
cnt(int(r['PY']), 'fy', uid)
cnt(ref[1], 'ty', uid)
cnt((full_ref, int(r['PY'])), 'c.fy', uid)
cnt((full_ref, r['SO']), 'c.fj', uid)
cnt((r['SO'],int(r['PY'])), 'fj.fy', uid)
cnt(full_ref, 'c', uid)
if not RUN_EVERYTHING:
continue
cnt((int(r['PY']), year), 'fy.ty', uid)
cnt((r['SO'], year), 'fj.ty', uid)
cnt(auth, 'ta', uid)
cnt((int(r['PY']),auth), 'fy.ta', uid)
cnt((r['SO'],auth), 'fj.ta', uid)
# first author!
ffa = citing_authors[0]
cnt(ffa, 'ffa', uid)
cnt((ffa,int(r['PY'])), 'ffa.fy', uid)
cnt((ffa,r['SO']), 'ffa.fj', uid)
cnt((full_ref,ffa), 'c.ffa', uid)
#cnt((ffa,r['SO'], int(r['PY'])), 'ffa.fj.fy', uid)
for a in citing_authors:
cnt(a, 'fa', uid)
cnt((a,int(r['PY'])), 'fa.fy', uid)
cnt((a,r['SO']), 'fa.fj', uid)
#cnt((a,r['SO'], int(r['PY'])), 'fa.fj.fy', uid)
cnt((full_ref,a), 'c.fa', uid)
cnt((full_ref, int(r['PY']), r['SO']), 'c.fy.j', uid)
# quick look at the last processed record: title, last parsed reference, and citing authors
r['TI']
full_ref
citing_authors
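A small optional sketch to summarize how many distinct keys each counting space has accumulated so far:
for space in sorted(cnt_doc):
    print(space, len(cnt_doc[space]))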
Because there are so many cited works, a full cocitation network would be prohibitively large to compute and store on disk.
Furthermore, the full network is not very useful.
The following creates a cocitation network among only the 1000 most common cited works.
For each pair, cnt['c.c'][(c1, c2)] counts the number of works in which c1 and c2 are cited together.
The ind and doc counters are identical for this counter, 'c.c'.
if RUN_EVERYTHING:
allowed_refs = Counter(dict(cnt_ind['c'].items())).most_common(1000)
allowed_refs = set( x[0] for x in allowed_refs )
print("# allowed references for cocitation analysis: %s" % len(allowed_refs))
print("Examples: %s" % str(list(allowed_refs)[:3]))
# enumerating cocitations among the 1000 most-cited works
dcount = 0
refcount = 0
wos_files = list(basedir.glob("**/*.txt"))
for i, f in enumerate(wos_files):
with f.open(encoding='utf8') as pfile:
r = DictReader(pfile, delimiter="\t")
rows = list(r)
if i % 50 == 0:
print("File %s/%s: %s" % (i, len(wos_files), f))
for r in rows:
if r['DT'] != "Article":
continue
refs = r["CR"].strip().split(";")
refs = list(fix_refs(refs))
if not len(refs):
continue
uid = r['UT']
try:
int(r['PY'])
except ValueError:
continue
for (auth,year,full_ref) in refs:
if full_ref not in allowed_refs:
continue
for (auth2,year2,full_ref2) in refs:
if full_ref2 <= full_ref:
continue
if full_ref2 not in allowed_refs:
continue
cnt((full_ref,full_ref2), 'c.c', uid)
cnt((year,year2), 'ty.ty', uid)
refcount += 1
if refcount % 10000 == 0:
print("%s cocitations logged" % refcount)
print("Finished!")
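An optional peek at the most frequent pairs (a sketch; Counter is available from the knowknow star-import, as used above):
if RUN_EVERYTHING:
    print("Top cocited pairs:", Counter(cnt_ind['c.c']).most_common(5))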
Because of the huge number of cited works that occur only once, it can be more efficient to simply log this statistic and remove those works from the counters.
This trimming may not be necessary, so it is disabled by default.
trimCounters = False
if trimCounters:
onlyOne = len([x for x in cnt_doc['c'] if cnt_doc['c'][x] == 1])
total = len(cnt_doc['c'])
print("Of the %s cited works, %s (%0.2f%%) have only a single citation" % (
total, onlyOne,
100 * onlyOne/total
))
terms = list(cnt_doc['c'].keys())
counts = np.array([cnt_doc['c'][k] for k in terms])
cutoff = 2
to_remove = set(terms[i] for i in np.flatnonzero(counts < cutoff))
print("consolidating", len(to_remove), list(to_remove)[:5])
print("old size:", len(cnt_doc['c']))
for tr in to_remove:
del cnt_doc['c'][tr]
del cnt_ind['c'][tr]
print("new size:", len(cnt_doc['c']))
print("old size:", len(cnt_doc['c.fy']))
cydels = [x for x in cnt_doc['c.fy'] if x[0] in to_remove]
for cydel in cydels:
del cnt_doc['c.fy'][cydel]
del cnt_ind['c.fy'][cydel]
print("new size:", len(cnt_doc['c.fy']))
print("old size:", len(cnt_doc['c.fj']))
cjdels = [x for x in cnt_doc['c.fj'] if x[0] in to_remove]
for cjdel in cjdels:
del cnt_doc['c.fj'][cjdel]
del cnt_ind['c.fj'][cjdel]
print("new size:", len(cnt_doc['c.fj']))
print(len(cnt_doc['c']), 'cited works')
# retrieve and use the MOST COMMON pub date for each
pubyears = {
k:max(s.keys(), key=lambda x:multi_year[k][x]) for k,s in multi_year.items()
if len(s)
}
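# For example (hypothetical counts): if a grouped reference was recorded with year 1959 by forty
# citing papers and with 1956 by three, pubyears maps that reference to 1959.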
varname = "%s.pubyears"%database_name
save_variable(varname, pubyears)
print("saved %s" % varname)
save_cnt("%s/doc"%database_name, cnt_doc)
save_cnt("%s/ind"%database_name, cnt_ind)
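The saved counters can later be reloaded with get_cnt, mirroring the pregroup load above; for example (a sketch, requesting only the spaces needed):
# reloaded = get_cnt("%s/doc" % database_name, ['c', 'c.fy'])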