#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# pylint: disable-msg=C0321
# See the LICENSE.rst file for licensing information.
'''
Bibulous is a drop-in replacement for BibTeX, with the primary advantage that the bibliography
template format is compact and *very* easy to modify.
The basic program flow is as follows:
(1) Read the .aux file and get the names of the bibliography databases (.bib files),
the style templates (.bst files) to use, and the entire set of citations.
(2) Read in all of the bibliography database files into one long dictionary (`bibdata`),
replacing any abbreviations with their full form. Cross-referenced data is *not* yet
inserted at this point. That is delayed until the time of writing the BBL file in order
to speed up parsing.
(3) Read in the Bibulous style template file as a dictionary (`bstdict`).
(4) Now that all the information is collected, go through each citation key, find the
corresponding entry key in `bibdata`. From the entry type, select a template from `bstdict`
and begin inserting the variables one-by-one into the template. If any data is missing,
check for cross-references and use crossref data to fill in missing values.
'''
from __future__ import unicode_literals, print_function, division ## for Python3 compatibility
import re
import os
import sys
import codecs ## for importing UTF8-encoded files
import locale ## for language internationalization and localization
import getopt ## for getting command-line options
import traceback ## for getting full traceback info in exceptions
import pdb ## put "pdb.set_trace()" at any place you want to interact with pdb
#def info(type, value, tb):
# #if (not sys.stderr.isatty() or
# # not sys.stdin.isatty()):
# # ## stdin or stderr is redirected, just do the normal thing
# # original_hook(type, value, tb)
# #else:
# if True:
# ## a terminal is attached and stderr is not redirected, debug
# traceback.print_exception(type, value, tb)
# print('')
# pdb.pm()
# #traceback.print_stack()
#sys.excepthook = info ## go into debugger automatically on an exception
__version__ = '1.0'
__all__ = ['get_bibfilenames', 'sentence_case', 'stringsplit', 'finditer', 'namefield_to_namelist',
'namedict_to_formatted_namestr', 'initialize_name', 'get_delim_levels',
'get_quote_levels', 'splitat', 'multisplit', 'enwrap_nested_string',
'enwrap_nested_quotes', 'purify_string', 'latex_to_utf8', 'parse_bst_template_str',
'namestr_to_namedict', 'search_middlename_for_prefixes', 'create_edition_ordinal',
'export_bibfile', 'parse_pagerange', 'parse_nameabbrev']
[docs]class Bibdata(object):
'''
Bibdata is a class to hold all data related to a bibliography database, a citation list, and a
style template.
To initialize the class, either call it with the filename of the ".aux" file containing the
relevant file locations (for the ".bib" database files and the ".bst" template files) or simply
call it with a list of all filenames to be used (".bib", ".bst" and ".aux"). The output file
(the LaTeX-formatted bibliography) is assumed to have the same filename root as the ".aux"
file, but with ".bbl" as its extension.
Attributes
----------
abbrevkey_pattern
abbrevs
anybrace_pattern
anybraceorquote_pattern
bibdata
bstdict
citedict
debug
endbrace_pattern
filedict
filename
i
options
quote_pattern
startbrace_pattern
Methods
-------
parse_bibfile
parse_bibentry
parse_bibfield
parse_auxfile
parse_bstfile
write_bblfile
create_citation_list
format_bibitem
generate_sortkey
create_namelist
format_namelist
insert_crossref_data
write_citeextract
write_authorextract
Example
-------
...
'''
def __init__(self, filename, debug=False):
self.debug = debug
self.abbrevs = {'jan':'1', 'feb':'2', 'mar':'3', 'apr':'4', 'may':'5', 'jun':'6',
'jul':'7', 'aug':'8', 'sep':'9', 'oct':'10', 'nov':'11', 'dec':'12'}
self.locale = locale.setlocale(locale.LC_ALL,'') ## set the locale to the user's default
self.bibdata = {'preamble':''}
self.filedict = get_bibfilenames(filename, debug=debug)
self.citedict = {} ## the dictionary containing the original data from the AUX file
self.citelist = [] ##citation keys in the ordered they will be printed in the final result
self.bstdict = {}
## Temporary variables for use in error messages while parsing files.
self.filename = '' ## the current filename (for error messages)
self.i = 0 ## counter for line in file (for error messages)
## Put in default options settings.
self.options = {}
self.options['undefstr'] = '???'
self.options['replace_newlines'] = True
self.options['namelist_format'] = 'first_name_first' #zzz
self.options['citation_order'] = 'citenum'
self.options['maxauthors'] = sys.maxint
self.options['maxeditors'] = 100
self.options['minauthors'] = 5
self.options['mineditors'] = 5
self.options['procspie_as_journal'] = False
self.options['use_firstname_initials'] = True #zzz
self.options['use_name_ties'] = False #zzz
self.options['show_urls'] = False
self.options['use_abbrevs'] = True
#self.options['hyperref'] =
self.options['backrefstyle'] = 'none'
self.options['backrefs'] = False
self.options['sort_case'] = True
self.options['french_initials'] = False
self.options['sort_with_prefix'] = False
self.options['period_after_initial'] = True #zzz
self.options['terse_inits'] = False
self.options['force_sentence_case'] = False
self.options['bibitemsep'] = None
self.options['bibextract'] = ''
self.options['month_abbrev'] = True
## Compile some patterns for use in regex searches.
self.anybrace_pattern = re.compile(r'(?<!\\)[{}]', re.UNICODE)
self.startbrace_pattern = re.compile(r'(?<!\\){', re.UNICODE)
self.endbrace_pattern = re.compile(r'(?<!\\)}', re.UNICODE)
self.quote_pattern = re.compile(r'(?<!\\)"', re.UNICODE)
self.abbrevkey_pattern = re.compile(r'(?<!\\)[,#]', re.UNICODE)
self.anybraceorquote_pattern = re.compile(r'(?<!\\)[{}"]', re.UNICODE)
## Print out some info on Bibulous and the files it is working on.
print('This is Bibulous, version ' + str(__version__))
print('The current working directory is: ' + os.getcwd())
print('The top-level TeX file: ' + str(self.filedict['tex']))
print('The top-level auxiliary file: ' + str(self.filedict['aux']))
print('The bibliography database file(s): ' + str(self.filedict['bib']))
print('The Bibulous style template file(s): ' + str(self.filedict['bst']))
print('The output formatted bibliography file: ' + str(self.filedict['bbl']))
## If "filename" is a list of filenames, then convert them one by one in order.
if self.filedict['bib']:
for f in self.filedict['bib']:
self.parse_bibfile(f)
if self.filedict['aux']:
self.parse_auxfile(self.filedict['aux'])
if self.filedict['bst']:
for f in self.filedict['bst']:
self.parse_bstfile(f, ignore_overwrite=True)
return
## =============================
[docs] def parse_bibfile(self, filename):
'''
Parse a *.bib file to generate a dictionary representing a bibliography database.
Parameters
----------
filename : str
The filename of the .bib file to parse.
'''
self.filename = filename
#filehandle = open(os.path.normpath(self.filename), 'rU')
filehandle = codecs.open(os.path.normpath(self.filename), 'r', 'utf-8')
## This next block parses the lines in the file into a dictionary. The tricky part here is
## that the BibTeX format allows for multiline entries. So we have to look for places where
## a line does not end in a comma, and check the following line to see if it a continuation
## of that line. Unfortunately, this means we need to read the whole file into memory ---
## not just one line at a time.
entry_brace_level = 0
## The variable "entrystr" contains all of the contents of the entry between the entrytype
## definition "@____{" and the closing brace "}". Once we've obtained all of the (in general
## multiline) contents, then we hand them off to parse_bibentry() to format them.
entrystr = ''
entrytype = None
self.i = 0 ## line number counter --- for error messages only
for line in filehandle:
self.i += 1
## Ignore empty and comment lines.
if not line: continue
if line.strip().startswith('%'): continue
if line.startswith('}'):
## If a line *starts* with a closing brace, then assume the intent is to close the
## current entry.
entry_brace_level = 0
self.parse_bibentry(entrystr, entrytype) ## close out the entry
entrystr = ''
if (line[1:].strip() != ''):
print('Warning: line#' + str(self.i) + ' of "' + self.filename + '" has data '
'outside of an entry {...} block. Skipping all contents until the next '
'entry ...')
#raise ValueError
continue
## Don't strip off leading and ending whitespace until after checking if '}' begins a
## line (as in the block above).
line = line.strip()
#print('line=', line) #zzz
if line.startswith('@'):
brace_idx = line.find('{') ## assume a form like "@ENTRYTYPE{"
if (brace_idx == -1):
print('Warning: open brace not found for the entry beginning on line#' + \
str(self.i) + ' of "' + self.filename + '". Skipping to next entry ...')
entry_brace_level = 0
continue
entrytype = line[1:brace_idx].lower().strip() ## extract string between "@" and "{"
line = line[brace_idx+1:]
entry_brace_level = 1
## If we are not current inside an active entry, then skip the line and wait until the
## the next entry.
if (entry_brace_level == 0):
if (line.strip() != ''):
print('Warning: line#' + str(self.i) + ' of "' + self.filename + '" has data '
'outside of an entry {...} block. Skipping all contents until the next '
'entry ...')
#raise ValueError
continue
## Look if the entry ends with this line. If so, append it to "entrystr" and call the
## entry parser. If not, then simply append to the string and continue.
endpos = len(line)
for match in re.finditer(self.anybrace_pattern, line):
if (match.group(0)[-1] == '}'):
entry_brace_level -= 1
elif (match.group(0)[-1] == '{'):
entry_brace_level += 1
if (entry_brace_level == 0):
## If we've found the final brace, then check if there is anything after it.
if (line[match.end():].strip() != ''):
print('Warning: line#' + str(self.i) + ' of "' + self.filename + '" has '
'data outside of an entry {...} block. Skipping all contents until '
'the next entry ...')
#raise ValueError
endpos = match.end()
break
if (entry_brace_level == 0):
entrystr += line[:endpos-1] ## use "-1" to remove the final closing brace
self.parse_bibentry(entrystr, entrytype)
entrystr = ''
else:
entrystr += line[:endpos] + '\n'
filehandle.close()
return
## =============================
[docs] def parse_bibentry(self, entrystr, entrytype):
'''
Given a string representing the entire contents of the BibTeX-format bibliography entry,
parse the contents and place them into the bibliography preamble string, the set of
abbreviations, and the bibliography database dictionary.
Parameters
----------
entrystr : str
The string containing the entire contents of the bibliography entry.
entrytype : str
The type of entry ("article", "preamble", etc.).
'''
if not entrystr:
return
if (entrytype == 'comment'):
pass
elif (entrytype == 'preamble'):
## In order to use the same "parse_bibfield()" function as all the other options, add a
## fake key onto the front of the string before calling "parse_bibfield()".
fd = self.parse_bibfield('fakekey = ' + entrystr)
if fd: self.bibdata['preamble'] += '\n' + fd['fakekey']
elif (entrytype == 'string'):
fd = self.parse_bibfield(entrystr)
if fd: self.abbrevs.update(fd)
else:
## First get the entry key. Then send the remainder of the entry string to the parser.
idx = entrystr.find(',')
if (idx == -1):
print('Warning: the entry ending on line #' + str(self.i) + ' of file "' + \
self.filename + '" is does not have an "," for defining the entry key. '
'Skipping ...')
return(fd)
entrykey = entrystr[:idx].strip()
entrystr = entrystr[idx+1:]
self.bibdata[entrykey] = {}
self.bibdata[entrykey]['entrytype'] = entrytype
fd = self.parse_bibfield(entrystr)
if fd: self.bibdata[entrykey].update(fd)
return
## =============================
[docs] def parse_bibfield(self, entrystr):
'''
For a given string representing the raw contents of a BibTeX-format bibliography entry,
parse the contents into a dictionary of key:value pairs corresponding to the field
names and field values.
Parameters
----------
entrystr : str
The string containing the entire contents of the bibliography entry.
'''
## TODO: this function deperately needs some simplification!
entrystr = entrystr.strip()
fd = {} ## the dictionary for holding key:value string pairs
while entrystr:
## First locate the field key.
idx = entrystr.find('=')
if (idx == -1):
print('Warning: the entry ending on line #' + str(self.i) + ' of file "' + \
self.filename + '" is an abbreviation-type entry but does not have an "=" '
'for defining the end of the abbreviation key. Skipping ...')
#raise ValueError
return(fd)
fieldkey = entrystr[:idx].strip()
fieldstr = entrystr[idx+1:].strip()
if not fieldstr:
entrystr = ''
continue
## Next we go through the field contents, which may involve concatenating. When we
## reach the end of an individual field, we return the "result string" and
## truncate the entry string to remove the part we just finished parsing.
resultstr = ''
while fieldstr:
firstchar = fieldstr[0]
if (firstchar == ','):
## Reached the end of the field, truncate the entry string return to the outer
## loop over fields.
fd[fieldkey] = resultstr
entrystr = fieldstr[1:].strip()
break
elif (firstchar == '#'):
## Reached a concatenation operator. Just skip it.
fieldstr = fieldstr[1:].strip()
elif (firstchar == '"'):
## Search for the content string that resolves the double-quote delimiter.
## Once you've found the end delimiter, append the content string to the
## result string.
endpos = len(fieldstr)
entry_brace_level = 0
for match in re.finditer(self.anybraceorquote_pattern, fieldstr[1:]):
if (match.group(0)[-1] == '}'):
entry_brace_level -= 1
elif (match.group(0)[-1] == '{'):
entry_brace_level += 1
if (match.group(0)[-1] == '"') and (entry_brace_level == 0):
endpos = match.end()
break
resultstr += ' ' + fieldstr[1:endpos]
fieldstr = fieldstr[endpos+1:].strip()
if not fieldstr: entrystr = ''
elif (firstchar == '{'):
## Search for the endbrace that resolves the brace level. Once you've found it,
## add the intervening contents to the result string.
endpos = len(fieldstr)
entry_brace_level = 1
for match in re.finditer(self.anybrace_pattern, fieldstr[1:]):
if (match.group(0)[-1] == '}'):
entry_brace_level -= 1
elif (match.group(0)[-1] == '{'):
entry_brace_level += 1
if (entry_brace_level == 0):
endpos = match.end()
break
resultstr += ' ' + fieldstr[1:endpos]
fieldstr = fieldstr[endpos+1:].strip()
if not fieldstr: entrystr = ''
else:
## If the fieldstr doesn't begin with '"' or '{' or '#', then the next set of
## characters must be an abbreviation key. An abbrev key ends with a whitespace
## followed by either '#' or ',' (or the end of the field). Anything else is a
## syntax error.
endpos = len(fieldstr)
end_of_field = False
if not re.search(self.abbrevkey_pattern, fieldstr):
## If the "abbrevkey" is an integer, then it's not actually an abbreviation.
## Convert it to a string and insert the number itself.
abbrevkey = fieldstr
if abbrevkey.isdigit():
resultstr += str(abbrevkey)
else:
if abbrevkey in self.abbrevs:
resultstr += self.abbrevs[abbrevkey].strip()
else:
print('Warning: for the entry ending on line #' + str(self.i) + \
' of file "' + self.filename + '", cannot find the '
'abbreviation key "' + abbrevkey + '". Skipping ...')
resultstr = self.options['undefstr']
fieldstr = ''
end_of_field = True
else:
for match in re.finditer(self.abbrevkey_pattern, fieldstr):
endpos = match.end()
if (match.group(0)[0] == '#'):
abbrevkey = fieldstr[:endpos-1].strip()
if abbrevkey.isdigit():
resultstr += str(abbrevkey)
elif (abbrevkey not in self.abbrevs):
print('Warning: for the entry ending on line #' + str(self.i) + \
' of file "' + self.filename + '", cannot find the '
'abbreviation key "' + abbrevkey + '". Skipping ...')
resultstr += self.options['undefstr']
else:
resultstr += self.abbrevs[abbrevkey].strip()
fieldstr = fieldstr[endpos+1:].strip()
break
elif (match.group(0)[0] == ','):
abbrevkey = fieldstr[:endpos-1].strip()
if abbrevkey.isdigit():
resultstr += str(abbrevkey)
elif (abbrevkey not in self.abbrevs):
print('Warning: for the entry ending on line #' + str(self.i) + \
' of file "' + self.filename + '", cannot find the '
'abbreviation key "' + abbrevkey + '". Skipping ...')
resultstr += self.options['undefstr']
else:
resultstr += self.abbrevs[abbrevkey].strip()
fieldstr = fieldstr[endpos+1:].strip()
end_of_field = True
break
else:
raise SyntaxError('Should never reach here. What happened?')
## Since we found the comma at the end of this field's contents, we break here
## to return to the loop over fields.
if end_of_field:
entrystr = fieldstr.strip()
break
resultstr = resultstr.strip()
if self.options['replace_newlines']: resultstr = resultstr.replace('\n',' ')
## Having braces around quotes can cause problems when parsing nested quotes, and
## do not provide any additional functionality.
if ('{"}') in resultstr:
resultstr = resultstr.replace('{"}', '"')
if ("{'}") in resultstr:
resultstr = resultstr.replace("{'}", "'")
if ('{`}') in resultstr:
resultstr = resultstr.replace('{`}', '`')
fd[fieldkey] = resultstr
return(fd)
## =============================
[docs] def parse_auxfile(self, filename, debug=False):
'''
Read in an *.aux file and convert the `\citation{}` entries found there into a dictionary
of citekeys and citation order number.
Parameters
----------
filename : str
The filename of the *.aux file to parse.
'''
if debug: print('Reading AUX file "' + filename + '" ...')
self.filename = filename
#filehandle = open(os.path.normpath(self.filename), 'rU')
filehandle = codecs.open(os.path.normpath(filename), 'r', 'utf-8')
## Commented lines below are useful if you are reading in \bibcite{...} rather than
## \citation{...}.
#for line in file.readlines():
# line = unicode(line, 'utf-8')
# if not line.startswith(r'\bibcite'): continue
# line = line.replace(r'\bibcite{','')
#
# (citekey, number) = line.split('}{')
# number = number.replace('}','').strip()
# self.citedict[citekey] = int(number)
## First go through the file and grab the list of citation keys. Once we get them all, then
## we can go through the list and figure out the numbering.
keylist = []
for line in filehandle:
line = line.strip()
if not line.startswith(r'\citation{'): continue
## Remove the "\citeation{" from the front and the "}" from the back. If multiple
## citations are given, then split them using the comma.
items = line[10:-1].split(',')
for item in items:
keylist.append(item)
filehandle.close()
## If you didn't find any citations in the file, issue a warning. Otherwise, build a
## dictionary of keys giving the citation keys with values equal to the citation order
## number.
if not keylist:
print('Warning: no citations found in AUX file "' + filename + '"')
else:
q = 1 ## citation order counter
self.citedict[keylist[0]] = q
for key in keylist[1:]:
if key in self.citedict: continue
q += 1
self.citedict[key] = q
if debug:
## When displaying the dictionary, show it in order-sorted form. Remember to use
## the user's locale for the sort.
for key in sorted(self.citedict, key=self.citedict.get, cmp=locale.strcoll):
print(key + ': ' + str(self.citedict[key]))
return
## =============================
[docs] def parse_bstfile(self, filename, ignore_overwrite=False, debug=False):
'''
Convert a Bibulous-type bibliography style template into a dictionary.
The resulting dictionary consists of keys which are the various entrytypes, and values which \
are the template strings. In addition, any formatting options are stored in the "options" key \
as a dictionary of option_name:option_value pairs.
Parameters
----------
filename : str
The filename of the Bibulous style template to use.
ignore_overwrite : bool, optional
Whether to issue a warning when overwriting an existing template variable.
'''
self.filename = filename
#filehandle = open(os.path.normpath(filename), 'rU')
filehandle = codecs.open(os.path.normpath(filename), 'r', 'utf-8')
## First try to find out if the file is a BibTeX-style BST file rather than a Bibulous one.
## These files are not normally large, so reading in the whole thing should be quick.
alllines = filehandle.readlines()
if ('EXECUTE {' in alllines) or ('EXECUTE{' in alllines):
raise ImportError('The style template file "' + filename + '" appears to be BibTeX '
'format, not Bibulous. Aborting...')
for i,line in enumerate(alllines):
## Ignore any comment lines, and remove any comments from data lines.
if line.startswith('#'): continue
if ('#' in line):
idx = line.index('#')
line = line[:idx]
if ('=' in line):
res = line.split(' = ')
if (len(res) != 2):
raise SyntaxError('The line "' + line + '" (line #' + str(i) + ') in file "' + \
filename + '" is not valid BST grammar.')
else:
(lhs,rhs) = res
var = lhs.strip()
value = rhs.strip()
## If the value is numeric or bool, then convert the datatype from string.
if value.isdigit():
value = int(value)
elif (value == 'True') or (value == 'False'):
value = (value == 'True')
if var.startswith('options.'):
## The variable defines an option rather than an entrytype.
optvar = var[8:]
if (optvar in self.options) and (self.options[optvar] != value) and not ignore_overwrite:
print('Warning: overwriting the existing template option "' + optvar + \
'" from [' + str(self.options[optvar]) + '] to [' + str(value) + '] ...')
self.options[optvar] = value
else:
## The line defines an entrytype template.
if (var in self.bstdict) and (self.bstdict[var] != value) and not ignore_overwrite:
print('Warning: overwriting the existing template variable "' + var + \
'" from [' + self.bstdict[var] + '] to [' + value + '] ...')
self.bstdict[var] = value
filehandle.close()
## Next check to see whether any of the template definitions are simply maps to one
## of the other definitions. For example, in "osa.bst" I have a line of the form
## inbook = incollection
## which implies that the template for "inbook" should be mapped to the same template
## as defined for the "incollection" entrytype.
for key in self.bstdict:
if (key == 'options'): continue
nwords = len(re.findall(r'\w+', self.bstdict[key]))
if (nwords == 1) and ('<' not in self.bstdict[key]):
self.bstdict[key] = self.bstdict[self.bstdict[key]]
if debug:
## When displaying the bst dictionary, show it in sorted form.
for key in sorted(self.bstdict, key=self.bstdict.get, cmp=locale.strcoll):
print(key + ': ' + str(self.bstdict[key]))
## =============================
[docs] def write_bblfile(self, filename=None, write_preamble=True, write_postamble=True, bibsize=None,
verbose=False):
'''
Given a bibliography database `bibdata`, a dictionary containing the citations called out \
`citedict`, and a bibliography style template `bstdict` write the LaTeX-format file for the \
formatted bibliography.
Start with the preamble and then loop over the citations one by one, formatting each entry \
one at a time, and put `\end{thebibliography}` at the end when done.
Parameters
----------
filename : str, optional
The filename of the *.bbl file to write. (Default is to take the AUX file and change \
its extension to .bbl.)
write_preamble : bool, optional
Whether to write the preamble. (Setting this to False can be useful when writing the \
bibliography in separate steps.)
write_postamble : bool, optional
Whether to write the postamble. (Setting this to False can be useful when writing the \
bibliography in separate steps.)
bibsize : str, optional
A string the length of which is used to determine the label margin for the bibliography.
'''
if (filename == None):
filename = self.filedict['bbl']
if not write_preamble:
filehandle = open(filename, 'a')
else:
filehandle = open(filename, 'w')
## Create an inverse dictionary for the month names. The month names are determined by the
## user's default locale.
monthname_dict = {}
for i in range(1,13):
if self.options['month_abbrev']:
monthname_dict[str(i)] = locale.nl_langinfo(locale.__dict__['ABMON_'+str(i)]).title()
else:
monthname_dict[str(i)] = locale.nl_langinfo(locale.__dict__['MON_'+str(i)]).title()
if write_preamble:
if not bibsize: bibsize = repr(len(self.citedict))
filehandle.write('\\begin{thebibliography}{' + bibsize + '}\n'.encode('utf-8'))
#if self.options['show_urls']: filehandle.write(r'\providecommand{\url}[1]{\texttt{#1}}')
#if not self.options['show_urls']:
# filehandle.write(r'\providecommand{\url}[1]{}') ## i.e. ignore the URL
filehandle.write("\\providecommand{\\enquote}[1]{``#1''}\n".encode('utf-8'))
filehandle.write('\\providecommand{\\url}[1]{\\verb|#1|}\n'.encode('utf-8'))
filehandle.write('\\providecommand{\\href}[2]{#2}\n'.encode('utf-8'))
if (self.options['bibitemsep'] != None):
s = '\\setlength{\\itemsep}{' + self.options['bibitemsep'] + '}\n'
filehandle.write(s.encode('utf-8'))
if ('preamble' in self.bibdata):
filehandle.write(self.bibdata['preamble'].encode('utf-8'))
filehandle.write('\n\n'.encode('utf-8'))
## Define a list which contains the citation keys, sorted in the order in which we need for
## writing into the BBL file.
self.create_citation_list()
## Write out each individual bibliography entry. Some formatting options will actually cause
## the entry to be deleted, so we need the check below to see if the return string is empty
## before writing it to the file.
for c in self.citelist:
## Verbose output is for debugging.
if verbose: print('Writing entry "' + c + '" to "' + filename + '" ...')
self.insert_crossref_data(c)
self.create_namelist(c, 'author')
self.create_namelist(c, 'editor')
## Before inserting entries into the BBL file, we do "difficult" BIB parsing jobs
## here: insert cross-reference data, format author and editor name lists, generate
## the "edition_ordinal", etc. Doing it here means that we don't have to add lots of
## extra checks later, allowing for simpler code.
if (c in self.bibdata):
self.bibdata[c]['edition_ordinal'] = create_edition_ordinal(self.bibdata[c], c, self.options)
if (c in self.bibdata) and ('pages' in self.bibdata[c]):
(startpage,endpage) = parse_pagerange(self.bibdata[c]['pages'], c)
self.bibdata[c]['startpage'] = startpage
self.bibdata[c]['endpage'] = endpage
## The "month" is stored in the bibdata dictionary as a string representing an integer
## from 1 to 12. Here we need to translate it to a string name.
if (c in self.bibdata) and ('month' in self.bibdata[c]):
if self.bibdata[c]['month'].isdigit():
monthname = monthname_dict[self.bibdata[c]['month']]
else:
monthname = self.bibdata[c]['month']
self.bibdata[c]['monthname'] = monthname
s = self.format_bibitem(c)
if (s != ''):
## Need two line EOL's here and not one so that backrefs can work properly.
filehandle.write((s + '\n').encode('utf-8'))
if write_postamble:
filehandle.write('\n\\end{thebibliography}\n'.encode('utf-8'))
filehandle.close()
return
## ===================================
[docs] def create_citation_list(self):
'''
Create the list of citation keys, sorted into the proper order.
Returns
-------
citelist : list
The sorted list of citation keys.
'''
self.citelist = []
## The "citelist" consists of tuples containing the new sort key and the original citation
## key.
for c in self.citedict:
sortkey = self.generate_sortkey(c)
self.citelist.append((sortkey,c))
## Now that we have the sortkeys, sort the citation list in numerical order. (Since each
## element of citelist is a tuple whose first element is an integer, we need not use
## locale-dependent sorting here.)
self.citelist.sort()
## If using a citation order which is descending rather than ascending, then reverse the list.
if (self.options['citation_order'] == 'ydnt'):
self.citelist = self.citelist[::-1]
## If any duplicate sorting keys exist, append a label onto the end in order to make them
## unique. It doesn't matter what you append, just so long as it makes the key unique, so we
## use the entry's index in the list of keys.
#sortkeys = [key for (key,val) in self.citelist]
#sortkeys = [x+str(i) for i,x in enumerate(sortkeys) if sortkeys.count(x)>1]
#citekeys = [val for (key,val) in self.citelist]
#citekeys = [x+str(i) for i,x in enumerate(citekeys) if citekeys.count(x)>1]
## Finally, now that we have them in the order we want, we keep only the citation keys, so
## that we know which entry maps to which in the *.aux file.
self.citelist = [b for (a,b) in self.citelist]
return
## =============================
[docs] def generate_sortkey(self, citekey):
'''
From a bibliography entry and the formatting template options, generate a sorting key for the
entry.
Parameters
----------
citekey : str
The key for the current entry.
Returns
-------
sortkey : str
A string to use as a sorting key.
'''
## When a given citation key is not found in the database, return a warning. However, if the
## citeorder if just "citekey" then we *already* know how to sort it, so rather than return a
## warning go ahead and return the citekey so that it gets sorted properly. The fact that the
## key is not in the database will raise an error later.
citeorder = self.options['citation_order']
if (citeorder == 'citekey'):
return(citekey)
if citekey not in self.bibdata:
return('Warning: "' + citekey + '" is not in the bibliography database.')
bibentry = self.bibdata[citekey]
if ('sortkey' in bibentry):
return(bibentry['sortkey'])
if (citeorder in ('citenum','citenumber')):
return(self.citedict[citekey])
namelist = []
## Use the name abbreviation (if it exists) in the sortkey.
if ('sortname' in bibentry) and ('nameabbrev' in bibentry['sortname']):
nameabbrev_dict = parse_nameabbrev(bibentry['sortname']['nameabbrev'])
elif ('nameabbrev' in bibentry[citekey]):
nameabbrev_dict = parse_nameabbrev(bibentry[citekey]['nameabbrev'])
else:
nameabbrev_dict = None
## Extract the last name of the first author.
if ('sortname' in bibentry):
namelist = namefield_to_namelist(bibentry['sortname'], key=citekey,
nameabbrev=nameabbrev_dict)
name = namelist[0]['last']
if ('first' in namelist[0]): name += namelist[0]['first']
if ('middle' in namelist[0]): name += namelist[0]['middle']
else:
if ('author' in bibentry):
self.create_namelist(citekey, 'author')
namelist = self.bibdata[citekey]['authorlist']
name = namelist[0]['last']
if self.options['sort_with_prefix'] and ('prefix' in namelist[0]):
name = namelist[0]['prefix'] + name
if ('first' in namelist[0]): name += namelist[0]['first']
if ('middle' in namelist[0]): name += namelist[0]['middle']
elif ('editor' in bibentry):
self.create_namelist(citekey, 'editor')
namelist = self.bibdata[citekey]['editorlist']
name = namelist[0]['last']
if self.options['sort_with_prefix'] and ('prefix' in namelist[0]):
name = namelist[0]['prefix'] + name
if ('first' in namelist[0]): name += namelist[0]['first']
if ('middle' in namelist[0]): name += namelist[0]['middle']
elif ('organization' in bibentry):
name = bibentry['organization']
elif ('institution' in bibentry):
name = bibentry['institution']
else:
name = ''
## Names that have initials will have unwanted '.'s in them.
name = name.replace('.','')
if ('sortyear' in bibentry):
year = str(bibentry['sortyear'])
else:
year = '9999' if ('year' not in bibentry) else str(bibentry['year'])
## "presort" is a string appended to the beginning of each sortkey. This can be useful for
## grouping entries.
presort = '' if ('presort' not in bibentry) else str(bibentry['presort'])
## "sorttitle" is an alternative title used for sorting. Can be useful if the title contains
## special characters and LaTeX commands.
if ('sorttitle' in bibentry):
title = bibentry['sorttitle']
else:
title = '' if ('title' not in bibentry) else bibentry['title']
volume = '0' if ('volume' not in bibentry) else str(bibentry['volume'])
## The different formatting options for the citation order are "nty"/"plain", "nyt", "nyvt",
## "anyt", "anyvt", ynt", and "ydnt".
if (citeorder in ('nyt','plain')):
sortkey = presort + name + year + title
elif (citeorder == 'nty'):
sortkey = presort + name + title + year
elif (citeorder == 'nyvt'):
sortkey = presort + name + year + volume + title
elif (citeorder in ('ynt','ydnt')):
sortkey = presort + year + name + title
elif (citeorder == 'alpha'):
if (len(namelist) == 1):
name = name[0:3]
elif (len(namelist) > 1):
if namelist:
name = ''.join([name['last'][0] for name in namelist])
else:
name = name[0:3]
sortkey = presort + name[0:3] + year[-2:]
elif (citeorder == 'anyt'):
alpha = '' if ('alphalabel' not in self.citedict[citekey]) else self.citedict[citekey]['alphalabel']
sortkey = presort + alpha + name + year + title
elif (citeorder == 'anyvt'):
alpha = '' if ('alphalabel' not in self.citedict[citekey]) else self.citedict[citekey]['alphalabel']
sortkey = presort + alpha + name + year + volume + title
#elif (citeorder in ('citenumber','citenum','none')):
# raise ValueError('Wrong citation sort order option. How did this happen?')
else:
raise KeyError('That citation sort order ("' + citeorder + '") is not supported.')
sortkey = purify_string(sortkey)
if not self.options['sort_case']:
sortkey = sortkey.lower()
return(sortkey)
## =============================
[docs] def create_namelist(self, key, nametype):
'''
Deconstruct the bibfile string following "author = ..." (or "editor = ..."), and create a
new field `authorlist` or `editorlist` that is a list of dictionaries (one dict for each
person).
Parameters
----------
key : str
The key in `bibdata` defining the current entry being formatted.
nametype : str, {'author','editor'}
Which bibliography field to use for parsing names.
'''
## First check that the entrykey exists.
if (key not in self.bibdata):
return
## If the name field does not exist, we can't define name dictionaries, so exit. Note that
## we need not check for cross-reference data, since we will assume that has already been
## done.
if (nametype not in self.bibdata[key]):
return
## If the namelist is already defined, then there is nothing to do!
if (nametype+'list' in self.bibdata[key]):
return
## The name abbreviations are assumed to be used for generating initials, and so skip
## using abbrevations when not initializing.
if (self.options['use_firstname_initials']) and ('nameabbrev' in self.bibdata[key]):
nameabbrev_dict = parse_nameabbrev(self.bibdata[key]['nameabbrev'])
else:
nameabbrev_dict = None
## Convert the raw string containing author/editor names in the BibTeX field into a list
## of dictionaries --- one dict for each person.
namelist = namefield_to_namelist(self.bibdata[key][nametype], key=key,
nameabbrev=nameabbrev_dict)
self.bibdata[key][nametype+'list'] = namelist
return
## =============================
[docs] def insert_crossref_data(self, entrykey, fieldname=None):
'''
Insert crossref info into a bibliography database dictionary.
Loop through a bibliography database dictionary and, for each entry which has a "crossref"
field, locate the crossref entry and insert any missing bibliographic information into the
main entry's fields.
Parameters
----------
entrykey : str
The key of the bibliography entry to query.
fieldname : str, optional
The name of the field to check. If fieldname==None, then check all fields.
Returns
-------
foundit : bool
Whether the function found a crossref for the queried field. If multiple fieldnames \
were input, then foundit will be True if a crossref is located for any one of them.
'''
## First check that the entrykey exists.
if (entrykey not in self.bibdata):
return(False)
bibentry = self.bibdata[entrykey]
if ('crossref' not in self.bibdata[entrykey]):
return(False)
if (fieldname == None):
fieldnames = bibentry.keys()
else:
if isinstance(fieldname, list):
fieldnames = fieldname
else:
fieldnames = [fieldname]
foundit = False
for field in fieldnames:
if (field not in bibentry):
continue
## Check that the crossreferenced entry actually exists. If not, then just move on.
if (self.bibdata[entrykey]['crossref'] in self.bibdata):
crossref_keys = self.bibdata[self.bibdata[entrykey]['crossref']]
else:
print('Warning: bad cross reference. Entry "' + entrykey + '" refers to entry "' + \
self.bibdata[entrykey]['crossref'] + '", which doesn\'t exist.')
continue
for k in crossref_keys:
if (k not in self.bibdata[entrykey]):
self.bibdata[entrykey][k] = self.bibdata[self.bibdata[entrykey]['crossref']][k]
foundit = True
## What a "booktitle" is in the entry is normally a "title" in the crossref.
if ('title' in crossref_keys) and ('booktitle' not in self.bibdata[entrykey]):
self.bibdata[entrykey]['booktitle'] = self.bibdata[self.bibdata[entrykey]['crossref']]['title']
foundit = True
return(foundit)
## =============================
[docs]def get_bibfilenames(filename, debug=False):
'''
If the input is a filename ending in '.aux', then read through the .aux file and locate the
lines `\bibdata{...}` and `\bibstyle{...}` to get the filename(s) for the bibliography database
and style template.
If the input is a list of filenames, then assume that this is the complete list of files to
use.
Parameters
----------
filename : str
The "auxiliary" file, containing citation info, TOC info, etc.
Returns
-------
filedict : dict
A dictionary with keys 'bib' and 'bst', each entry of which contains a list of filenames.
'''
bibfiles = []
bstfiles = []
auxfile = ''
bblfile = ''
texfile = ''
if isinstance(filename, basestring) and filename.endswith('.aux'):
auxfile = os.path.normpath(filename)
path = os.path.dirname(filename) + '/'
s = open(filename, 'rU')
for line in s.readlines():
line = line.strip()
if line.startswith('%'): continue
if line.startswith('\\bibdata{'):
line = line[9:]
indx = line.index('}')
bibres = line[:indx]
if line.startswith('\\bibstyle{'):
line = line[10:]
indx = line.index('}')
bstres = line[:indx]
## Now we have the strings from the *.tex file that describing the bibliography filenames.
## If these are lists of filenames, then split them out.
if (',' in bibres):
bibres = bibres.split(',') ## if a list of files, then split up the list pieces
else:
bibres = [bibres]
for r in bibres:
## If the filename is missing the extension, then add it.
if not r.endswith('.bib'):
r += '.bib'
## If the filename has a relative address, convert it to an absolute one.
if r.startswith('./') or r.startswith('.\\'):
r = path + r[2:]
elif r.startswith('/'):
## Linux absolute paths begin with a forward slash
pass
elif (r[0].isalpha() and r[1] == ':'):
## Windows absolute paths begin with a drive letter and a colon.
pass
else:
r = path + r
bibfiles.append(r)
## Next do the same thing for the *.bst files.
if (',' in bstres):
bstres = bstres.split(',') ## if a list of files, then split up the list pieces
else:
bstres = [bstres]
for r in bstres:
## If the filename is missing the extension, then add it.
if not r.endswith('.bst'):
r += '.bst'
## If the filename has a relative address, convert it to an absolute one.
if r.startswith('./') or r.startswith('.\\'):
r = path + r[2:]
elif r.startswith('/'):
## Linux absolute paths begin with a forward slash
pass
elif (r[0].isalpha() and r[1] == ':'):
## Windows absolute paths begin with a drive letter and a colon.
pass
else:
r = path + r
bstfiles.append(r)
## Finally, normalize the file strings to work on any platform.
for i in range(len(bibfiles)):
bibfiles[i] = os.path.normpath(bibfiles[i])
for i in range(len(bstfiles)):
bstfiles[i] = os.path.normpath(bstfiles[i])
## Or if the input is only a BIB file, then go off of that.
elif isinstance(filename, basestring) and filename.endswith('.bib'):
bibfiles = [os.path.normpath(filename)]
## Or of the input is a list, then we can go through all the filenames in the list.
elif isinstance(filename, (list, tuple)):
## All the work above was to locate the filenames from a single AUX file. However, if the
## input is a list of filenames, then constructing the filename dictionary is simple.
for f in filename:
if f.endswith('.aux'): auxfile = os.path.normpath(f)
elif f.endswith('.bib'): bibfiles.append(os.path.normpath(f))
elif f.endswith('.bst'): bstfiles.append(os.path.normpath(f))
elif f.endswith('.bbl'): bblfile = os.path.normpath(f)
elif f.endswith('.tex'): texfile = os.path.normpath(f)
if not bblfile:
bblfile = auxfile[:-4] + '.bbl'
if not texfile and auxfile:
texfile = auxfile[:-4] + '.tex'
## Now that we have the filenames, build the dictionary of BibTeX-related files.
filedict = {}
filedict['bib'] = bibfiles
filedict['bst'] = bstfiles
filedict['tex'] = texfile
filedict['aux'] = auxfile
filedict['bbl'] = bblfile
if debug:
print('bib files: ' + repr(bibfiles))
print('bst files: ' + repr(bstfiles))
print('tex file: "' + str(texfile) + '"')
print('aux file: "' + str(auxfile) + '"')
print('bbl file: "' + str(bblfile) + '"')
return(filedict)
## ===================================
[docs]def sentence_case(s):
'''
Reduce the case of the string to lower case, except for the first character in the string, and
except if any given character is at nonzero brace level.
Parameters
----------
s : str
The string to be modified.
Returns
-------
t : str
The resulting modified string.
'''
if ('{' not in s):
return(s.lower().capitalize())
## Next we look for LaTeX commands, given by a backslash followed by a non-word character.
## Do not reduce those characters in the LaTeX command to lower case. To facilitate that, add
## one to the brace level for each of those character positions.
brlevel = get_delim_levels(s)
for match in re.finditer(r'\\\w+', s, re.UNICODE):
(start,stop) = match.span()
brlevel[start:stop] = [level+1 for level in brlevel[start:stop]]
## Convert the string to a character list for easy in-place modification.
charlist = list(s)
for i,c in enumerate(charlist):
if (brlevel[i] == 0):
charlist[i] = c.lower()
charlist[0] = charlist[0].upper()
t = ''.join(charlist)
return(t)
## ===================================
[docs]def stringsplit(s, sep=r' |(?<!\\)~'):
'''
Split a string into tokens, taking care not to allow the separator to act unless at brace
level zero.
Parameters
----------
s : str
The string to split.
Returns
-------
tokens : list of str
The list of tokens.
'''
## The item separating the name tokens can be either a space character or a tilde, if the
## tilde is not preceded by a backslash (in which case it instead represents an accent
## character and not a separator).
if ('{' not in s):
tokens = re.split(sep, s)
else:
z = get_delim_levels(s)
indices = []
for match in re.finditer(sep, s):
(i,j) = match.span()
if (z[i] == 0):
## Record the indices of the start and end of the match.
indices.append((i,j))
ntokens = len(indices)
tokens = []
if (ntokens == 0):
tokens.append(s)
if (ntokens > 0) and (indices[0][0] > 0):
tokens.append(s[:indices[0][0]])
## Go through each match's indices and split the string at each.
for n in xrange(ntokens):
if (n == ntokens-1):
j = indices[n][1] ## the end of *this* separator
#print('n,j=', n, j)
tokens.append(s[j:])
else:
nexti = indices[n+1][0] ## the beginning of the *next* separator
j = indices[n][1] ## the end of *this* separator
#print('n,j,nexti=', n, j, nexti)
tokens.append(s[j:nexti])
return(tokens)
## ===================================
[docs]def finditer(a_str, sub):
'''
A generator replacement for re.finditer() but without using regex expressions.
Parameters
----------
a_str : str
The string to query.
sub : str
The substring to match to.
Yields
------
start : int
The start index of the match
'''
start = 0
while True:
start = a_str.find(sub, start)
if start == -1: return
yield start
start += 1
## =============================
[docs]def namefield_to_namelist(namefield, key=None, nameabbrev=None):
'''
Parse a name field ("author" or "editor") of a BibTeX entry into a list of dicts, one for
each person.
Parameters
----------
namefield : str
Either the "author" field of a database entry or the "editor" field.
key : str
The bibliography data entry key.
nameabbrev : dict
The dictionary for translating names to their abbreviated forms.
Returns
-------
namelist : list
A list of dictionaries, with one dictionary for each name. Each dict has keys "first",
"middle", "prefix", "last", and "suffix". The "last" key is the only one that is required.
'''
namefield = namefield.strip()
namelist = []
## Look for common typos.
if re.search('\sand,\s', namefield, re.UNICODE):
print('Warning: The name string in entry "' + str(key) + '" has " and, ", which is likely '
'a typo. Continuing on anyway ...')
if re.search(', and', namefield, re.UNICODE):
print('Warning: The name string in entry "' + str(key) + '" has ", and", which is likely '
'a typo. Continuing on anyway ...')
if (nameabbrev != None):
for key in nameabbrev:
if key in namefield: namefield = namefield.replace(key, nameabbrev[key])
## Split the string of names into individual strings, one for each complete name. Here we
## can split on whitespace surround the word "and" so that "{and}" and "word~and~word" will
## not allow the split. Need to treat the case of a single author separate from that of
## multiple authors in order to return a single-element *list* rather than a scalar.
if not re.search('\sand\s', namefield, re.UNICODE):
namedict = namestr_to_namedict(namefield)
namelist.append(namedict)
else:
if '{' not in namefield:
names = re.split('\sand\s', namefield)
else:
## If there are braces in the string, then we need to be careful to only allow
## splitting of the names when ' and ' is at brace level 0. This requires replacing
## re.split() with a bunch of low-level code.
z = get_delim_levels(namefield, ('{','}'))
separators = []
for match in re.finditer(r'\sand\s', namefield):
(i,j) = match.span()
if (z[i] == 0):
## Record the indices of the start and end of the match.
separators.append((i,j))
num_names = len(separators)
names = []
if (num_names == 0):
names.append(namefield)
if (num_names > 0) and (separators[0][0] > 0):
names.append(namefield[:separators[0][0]])
## Go through each match's indices and split the string at each.
for n in xrange(num_names):
if (n == num_names-1):
j = separators[n][1] ## the end of *this* separator
#print('n,j=', n, j)
names.append(namefield[j:])
else:
nexti = separators[n+1][0] ## the beginning of the *next* separator
j = separators[n][1] ## the end of *this* separator
#print('n,j,nexti=', n, j, nexti)
names.append(namefield[j:nexti])
nauthors = len(names)
for i in range(nauthors):
namedict = namestr_to_namedict(names[i])
namelist.append(namedict)
return(namelist)
## =============================
[docs]def initialize_name(name, options=None, debug=False):
'''
From an input name element (first, middle, prefix, last, or suffix) , convert it to its
initials.
Parameters
----------
name : str
The name element to be converted.
options : dict, optional
Includes formatting options such as
'use_name_ties': Whether to use '~' instead of spaces to tie together author initials.
'terse_inits': Whether to format initialized author names like "RMA Azzam" instead of \
the default form "R. M. A. Azzam".
'french_intials': Whether to initialize digraphs with two letters instead of the default \
of one. For example, if use_french_initials==True, then "Christian" -> "Ch.", not "C.".
'period_after_initial': Whether to include a '.' after the author initial.
Returns
-------
new_name : str
The name element converted to its initials form.
'''
if (name == ''):
return(name)
if (options == None):
options = {}
options['period_after_initial'] = True
options['french_initials'] = False
options['terse_inits'] = False
## Go through the author's name elements and truncate the first and middle names to their
## initials. If using French initials, then a hyphenated name produces hyphenated initials
## (for example "Jean-Paul" -> "J.-P." and not "J."), and a name beginning with a digraph
## produces a digraph initial, so that Philippe -> Ph. (not P.), and Christian -> Ch. (not C.).
digraphs = ('Ch','Gn','Ll','Ph','Ss','Ch','Ph','Th')
period = '.' if options['period_after_initial'] else ''
## Split the name by spaces (primarily used for middle name strings, which may have multiple
## names in them), and remove any empty list elements produced by the split.
tokens = list(name.split(' '))
tokens = [x for x in tokens if x]
for j,nametoken in enumerate(tokens):
if nametoken.startswith('{') and nametoken.endswith('}'): continue
if ('-' in nametoken):
## If the name is hyphenated, then initialize the hyphenated parts of it, and assemble
## the result by combining initials with the hyphens. That is, "Chein-Ing" -> "C.-I.".
pieces = nametoken.split('-')
tokens[j] = '-'.join([initialize_name(p, options) for p in pieces])
else:
## If the token already ends in a period then it's might already be an initial, but
## only if it has length 2. Otherwise you still need to truncate it.
## If the name only has one character in it, then treat it as a full name that doesn't
## need initializing.
if nametoken.endswith('.') and (len(nametoken) == 2):
tokens[j] = nametoken[0] + period
elif (len(nametoken) > 1):
if options['french_initials'] and (nametoken[:2] in digraphs):
tokens[j] = nametoken[:2] + period
else:
tokens[j] = nametoken[0] + period
else:
tokens[j] = nametoken
if options['terse_inits']:
newname = ''.join(tokens).replace(period,'')
else:
newname = ' '.join(tokens)
if debug: print('Initializer input [' + name + '] --> output [' + newname + ']')
return(newname)
## ===================================
[docs]def get_delim_levels(s, delims=('{','}'), operator=None):
'''
Generate a list of level numbers for each character in a string.
Parameters
----------
s : str
The string to characterize.
ldelim : str
The left-hand-side delimiter.
rdelim : str
The right-hand-side delimiter.
is_regex : bool
Whether the delimiters are expressed as regular expressions or as simple strings.
Returns
-------
oplevels : list of ints
A list giving the operator delimiter level (or "brace level" if no operator is given) of \
each character in the string.
'''
stack = []
oplevels = [0]*len(s) ## operator level
brlevels = [0]*len(s) ## brace level
## Locate the left-delimiters and increment the level each time we find one.
if (operator != None) and (s.count(operator) < 1):
return(oplevels)
for j,c in enumerate(s):
if (c == delims[0]):
if (operator != None) and (s[j-len(operator):j] == operator):
stack.append('o') ## add an "operator" marker to the stack
else:
stack.append('b') ## add a "brace level" marker to the stack
elif (c == delims[1]):
stack.pop()
if (operator != None): oplevels[j] = stack.count('o')
brlevels[j] = stack.count('b')
if (operator != None):
return(oplevels)
else:
return(brlevels)
## ===================================
[docs]def show_levels_debug(s, levels):
'''
A debugging tool for showing delimiter levels and the input string next to one another.
'''
q = 0 ## counter for the character ending a line
if ('\n' in s):
for line in s.split('\n'):
print(line)
print(str(levels[q:q+len(line)])[2:-1].replace(',','').replace(' ',''))
q += len(line)
else:
print(s)
print(str(levels)[2:-1].replace(',','').replace(' ',''))
return
## ===================================
[docs]def get_quote_levels(s, debug=False):
'''
Return a list which gives the "quotation level" of each character in the string.
Parameters
----------
s : str
The string to analyze.
Returns
-------
alevels : list
The double-quote-level for (``,'') pairs in the string.
blevels : list
The single-quote-level for (`,') pairs in the string.
clevels : list
The neutral-quote-level for (",") pairs in the string.
Notes
-----
When using double-quotes, it is easy to break the parser, so they should be used only sparingly.
'''
stack = []
alevels = [0]*len(s) ## double-quote level
blevels = [0]*len(s) ## single-quote level
clevels = [0]*len(s) ## neutral-quote level
for j,ch in enumerate(s):
if (ch == '`') and (s[j-1] != '\\'):
## Note: the '\\' case here is for detecting a grave accent markup.
if (s[j:j+2] == '``'):
stack.append('a') ## add a double-quote marker to the stack
elif (s[j-1] != '`') and (s[j-1].isspace() or (s[j-1] in '()[]{}":;,<>') or (j == 0)):
## The trouble with single-quotes is the clash with apostrophes. So, only increment
## the single-quote counter if we think it is the start of a quote (not immediately
## preceded by a non-whitespace character).
stack.append('b') ## add a single-quote marker to the stack
elif (ch == "'") and (alevels[j-1]+blevels[j-1]+clevels[j-1] > 0) and (s[j-1] != '\\'):
## Note: the '\\' case here is for detecting an accent markup.
if (s[j:j+2] == "''"):
stack.pop() ## remove double-quote marker from the stack
elif (s[j-1] != "'"):
stack.pop() ## remove single-quote marker from the stack
elif (ch == '"') and (s[j-1] != '\\'):
## Note: the '\\' here case is for not detecting an umlaut markup.
if (len(stack) > 0) and (stack[-1] == 'c'):
stack.pop()
else:
stack.append('c')
alevels[j] = stack.count('a')
blevels[j] = stack.count('b')
clevels[j] = stack.count('c')
if (alevels[-1] > 0):
print('Warning: found mismatched "``"..."''" quote pairs in the input string "' + s + \
'". Ignoring the problem and continuing on ...')
alevels[-1] = 0
if (blevels[-1] > 0):
print('Warning: found mismatched "`"..."\'" quote pairs in the input string "' + s + \
'". Ignoring the problem and continuing on ...')
blevels[-1] = 0
if (clevels[-1] > 0):
print('Warning: found mismatched '"'...'"' quote pairs in the input string "' + s + \
'". Ignoring the problem and continuing on ...')
clevels[-1] = 0
if debug:
show_levels_debug(s, alevels)
show_levels_debug(s, blevels)
show_levels_debug(s, clevels)
return(alevels, blevels, clevels)
## ===================================
[docs]def splitat(s, ilist):
'''
Split a string at locations given by a list of indices.
This can be used more flexibly than Python's native string split() function, when the character
you are splitting on is not always a valid splitting location.
Parameters
----------
s : str
The string to split.
ilist : list
The list of string index locations at which to split the string.
Returns
-------
slist : list of str
The list of split substrings.
Example
-------
s = splitat('here.but.not.here', [4,8])
>>> ['here', 'but', 'not.here']
'''
numsplit = len(ilist)
slist = []
ilist.sort()
## If the function is being asked to "split" at the first character, then just truncate.
if (ilist[0] == 0):
s = s[1:]
ilist = ilist[1:]
numsplit -= 1
## If the function is being asked to "split" at the last character, then just truncate.
if (ilist[-1] == (len(s)-1)):
s = s[:-1]
ilist = ilist[:-1]
numsplit -= 1
## If truncation has removed all of the "splitting" locations from the list, then return.
if (numsplit == 0):
return(s, '')
start = 0
end = len(s)
slist.append(s[:ilist[0]])
if (len(ilist) == 1):
slist.append(s[ilist[0]+1:])
elif (len(ilist) > 1):
for n in range(numsplit-1):
start = ilist[n] + 1
end = ilist[n+1]
#print(n, start, end)
slist.append(s[start:end])
slist.append(s[ilist[-1]+1:])
return(slist)
## =============================
[docs]def multisplit(s, seps):
'''
Split a string using more than one separator.
Copied from http://stackoverflow.com/questions/1059559/python-strings-split-with-multiple-separators.
Parameters
----------
s : str
The string to split.
sep : list of str
The list of separators.
Returns
-------
res : list
The list of split substrings.
'''
res = [s]
for sep in seps:
(s, res) = (res, [])
for seq in s:
res += seq.split(sep)
return(res)
## ===================================
[docs]def enwrap_nested_string(s, delims=('{','}'), odd_operator=r'\textbf', even_operator=r'\textrm'):
'''
This function will return the input string if it finds there
are no nested operators inside (i.e. when the number of delimiters found is < 2).
Parameters
----------
s : str
The string whose nested operators are to be modified.
delims : tuple of two strings
The left and right delimiters for all matches.
odd_operator : str
The nested operator (applied to the left of the delimiter) currently used within string "s".
even_operator : str
The operator used to replace the currently used one for all even nesting levels.
Returns
-------
s : str
The modified string.
'''
if not (odd_operator in s):
return(s)
oplevels = get_delim_levels(s, delims, odd_operator)
if (oplevels[-1] > 0):
print('Warning: found mismatched "{","}" brace pairs in the input string. '
'Ignoring the problem and continuing on ...')
return(s)
## In the operator queue, we replace all even-numbered levels while leaving all odd-numbered
## levels alone. Recall that "oplevels" encodes the position of the *delimiter* and not the
## position of where the operator starts.
maxlevels = max(oplevels)
if (maxlevels < 2):
return(s)
## Select only even levels starting at 2. "q" is a counter for the number of substr
## replacements.
q = 0
shift = len(odd_operator) - len(even_operator)
for i,level in enumerate(oplevels):
if (level > 0) and (level % 2 == 0):
## Any place we see a transition from one level lower to this level is a substr we want
## to replace. The "shift" used here is to compensate for changes in the length of the
## string due to differences in length between the input operator and output.
if (oplevels[i-1] < level):
#print(i, level, oplevels[i-1], s, s[:i-len(odd_operator)], s[i:])
s = s[:i-len(odd_operator)-(q*shift)] + even_operator + s[i-(q*shift):]
q += 1
return(s)
## ===================================
[docs]def enwrap_nested_quotes(s, debug=False):
'''
Find nested quotes within strings and, if necessary, replace them with the proper nesting
(i.e. outer quotes use ``...'' while inner quotes use `...').
Parameters
----------
s : str
The string to modify.
Returns
-------
s : str
The new string in which nested quotes have been properly reformatted.
'''
## NOTE: There is quite a lot going on in this function, so it may help future development
## to write some of this work into subroutines.
## First check for cases where parsing is going to be very complicated. For now, we should just
## flag these cases to inform the user that they need to modify the source to tell the parser
## what they want to do.
if ("```" in s) or ("'''" in s):
print('Warning: the input string ["' + s + '"] contains multiple unseparated quote characters. '
'Bibulous cannot unnest the single and double quotes from this set, so the '
'separate quotations must be physically separated like ``{\:}`, for example. '
'Ignoring the quotation marks and continuing ...')
return(s)
## Note that a backtick preceded by a backslash, an explamation point, or a question mark,
## indicates LaTeX markup for a grave accent, an inverted explamation point, and an inverted
## question mark, respectively.
#adelims = ('``',"''")
anum = len(re.findall(r'(?<!\!\?\\)``', s))
#bdelims = ('`',"'")
bnum = len(re.findall(r'(?<!\!\?\\)`', s))
#cdelim = '"'
cnum = len(re.findall(r'(?<!\\)"', s))
if ((anum + bnum + cnum) <= 1) and (r'\enquote{' not in s):
return(s)
## Get the lists describing the quote nesting.
(alevels, blevels, clevels) = get_quote_levels(s)
## First, we look at the quote stack and replace *all* quote pairs with \enquote{...}. When
## done, we can use `enwrap_nested_string()` to replace the odd and even instances of
## \enquote{} with different quotation markers, depending on locale. `q` is the amount of
## shift between indices in the current string and those in the source string.
q = 0
for i in range(len(s)):
if (i == 0) and (alevels[0] == 1):
## This is a transition up in level. Add "\enquote{".
s = s[:i+q] + r'\enquote{' + s[i+2+q:]
q += len(r'\enquote{') - 2
elif (i > 0) and (alevels[i-1] < alevels[i]):
## This is a transition up in level. Add "\enquote{".
s = s[:i+q] + r'\enquote{' + s[i+2+q:]
q += len(r'\enquote{') - 2
elif (i > 0) and (alevels[i-1] > alevels[i]):
## This is a transition down in level. Add "}".
s = s[:i+q] + '}' + s[i+2+q:]
q -= 1
## Do the same thing for single quotes.
if (i == 0) and (blevels[0] == 1):
## This is a transition up in level. Add "\enquote{".
s = s[:i+q] + r'\enquote{' + s[i+1+q:]
q += len(r'\enquote{') - 1
elif (i > 0) and (blevels[i-1] < blevels[i]):
## This is a transition up in level. Add "\enquote{".
s = s[:i+q] + r'\enquote{' + s[i+1+q:]
q += len(r'\enquote{') - 1
elif (i > 0) and (blevels[i-1] > blevels[i]):
## This is a transition down in level. Add "}".
s = s[:i+q] + '}' + s[i+1+q:]
## Do the same thing for neutral quotes.
if (i == 0) and (clevels[0] == 1):
## This is a transition up in level. Add "\enquote{".
s = s[:i+q] + r'\enquote{' + s[i+1+q:]
q += len(r'\enquote{') - 1
elif (i > 0) and (clevels[i-1] < clevels[i]):
## This is a transition up in level. Add "\enquote{".
s = s[:i+q] + r'\enquote{' + s[i+1+q:]
q += len(r'\enquote{') - 1
elif (i > 0) and (clevels[i-1] > clevels[i]):
## This is a transition down in level. Add "}".
s = s[:i+q] + '}' + s[i+1+q:]
## Next, we determine the nesting levels of quotations.
qlevels = get_delim_levels(s, ('{','}'), r'\enquote')
## Finally, replace odd levels of quotation with "``" and even levels with "`".
## TODO: This approach is the American convention. Need to generalize this behavior for British
## convention, and, even more broadly, to all locale-dependent quotation mark behavior. For
## now let's just try to get the American version working.
t = 0
odd_operators = ("``","''")
even_operators = ("`","'")
if debug:
show_levels_debug(s, qlevels)
for i,level in enumerate(qlevels):
if (qlevels[i-1] == 0) and (level == 0): continue
## Any place we see a transition from one level lower to this level is a substr we want
## to replace. The shift "t" used here is to compensate for changes in the length of the
## string due to differences in length between the input operator and output. Also,
## if we are about to substitute in a quote character (or two, in the case of LaTeX
## double quotes), then we need to be careful to look in front and see if the quotes butt
## up against one another, in which case we need to add a small space. For example, LaTeX
## will interpret ''' as a double ending quote followed by a single ending quote, even if
## we meant it to be the other way around. The space "\:" will remove the ambiguity.
## There is quite a lot going on in this loop, so it may help future work to replace some
## of the operations here with subroutines.
if (level % 2 == 0):
if (i < len(r'\enquote{')-1) or (qlevels[i-1] < level):
left = i + t - len(r'\enquote{') + 1
right = i + t + 1
if (i > len(odd_operators[0])):
needs_space = (s[left-len(odd_operators[0]):left] == odd_operators[0])
else:
needs_space = False
if needs_space:
newstr = r'\:' + even_operators[0]
else:
newstr = even_operators[0]
s = s[:left] + newstr + s[right:]
t += len(newstr) - len(r'\enquote{')
elif (qlevels[i-1] > level):
left = i + t - len('}') + 1
right = i + t + 1
if (i > len(even_operators[1])):
needs_space = (s[left-len(even_operators[1]):left] == even_operators[1])
else:
needs_space = False
if needs_space:
newstr = r'\:' + odd_operators[1]
else:
newstr = odd_operators[1]
s = s[:left] + newstr + s[right:]
t += len(newstr) - len('}')
else:
if (i < len('r\enquote{')-1) or (qlevels[i-1] < level):
left = i + t - len(r'\enquote{') + 1
right = i + t + 1
if (i > len(even_operators[0])):
needs_space = (s[left-len(even_operators[0]):left] == even_operators[0])
else:
needs_space = False
if needs_space:
newstr = r'\:' + odd_operators[0]
else:
newstr = odd_operators[0]
s = s[:left] + newstr + s[right:]
t += len(newstr) - len(r'\enquote{')
elif (qlevels[i-1] > level):
left = i + t - len('}') + 1
right = i + t + 1
if (i > len(odd_operators[1])):
needs_space = (s[left-len(odd_operators[1]):left] == odd_operators[1])
else:
needs_space = False
if needs_space:
newstr = r'\:' + even_operators[1]
else:
newstr = even_operators[1]
s = s[:left] + newstr + s[right:]
t += len(newstr) - len('}')
return(s)
## =============================
[docs]def purify_string(s):
'''
Remove the LaTeX-based formatting elements from a string so that a sorting function can use \
alphanumerical sorting on the string.
Parameters
----------
s : str
The string to "purify".
Returns
-------
p : str
The "purified" string.
Notes
-----
Currently `purify_string()` does not allow LaTeX markup such as \'i to refer to the Unicode
character which is correctly written as \'\i. Add functionality to allow that?
'''
p = unicode(s)
if not ('\\' in p):
## Remove braces if present.
p = p.replace('{', '')
p = p.replace('}', '')
return(p)
## Convert any LaTeX character encodings directly to their unicode equivalents.
p = latex_to_utf8(p)
## Next we look for LaTeX commands. LaTeX variables will have the form "\variable" followed
## by either whitespace, '{', or '}'. A function will have the form "\functionname{...}"
## where the ellipsis can be anything.
match = re.compile(r'\\\w+', re.UNICODE)
p = match.sub('', p)
## Finally, remove the braces used for LaTeX commands. We can't just replace '{' and '}'
## wholesale, since the syntax allows '\{' and '\}' to produce text braces. So first we
## remove the command braces, and then we swap '\{' for '{' etc at the end.
match = re.compile(r'(?<!\\){', re.UNICODE)
p = match.sub('', p, re.UNICODE)
match = re.compile(r'(?<!\\)}', re.UNICODE)
p = match.sub('', p, re.UNICODE)
p = p.replace('\\{', '{')
p = p.replace('\\}', '}')
return(p)
## =============================
## I think that we can get rid of the "u" in front of the strings now that we are using
## "from __future__ import unicode_literals".
[docs]def latex_to_utf8(s):
'''
Translate LaTeX-markup special characters to their Unicode equivalents.
Parameters
----------
s : str
The string to translate.
Returns
-------
s : str
The translated version of the input.
'''
## Note that the code below uses replace() loops rather than a translation table, since the
## latter only supports one-to-one translation, whereas something more flexible is needed
## here. There is a substantial
if ('\\' not in s):
return(s)
## First, some easy replacements.
trans = {r'\$':'$', r'\%':'%', r'\_':'_', r'\&':'&', r'\#':'#', r'!`':'¡', r'?`':'¿'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## If there are any double backslashes, place some braces around them to prevent something like
## "\\aa" from getting interpreted as a backslash plus "\aa". This makes the string easier to
## parse and does no harm.
if r'\\' in s:
s = s.replace(r'\\',r'{\\}')
## First identify all the LaTeX special characters in the string, then replace them all with
## their Unicode equivalents. (This is to make sure that they alphabetize correctly for
## sorting.)
## The translation dictionary below uses single-letter accent commands, such as \u{g}.
## These commands are one of (r'\b', r'\c', r'\d', r'\H', r'\k', r'\r', r'\u', r'\v'), followed
## by an open brace, a single character, and a closed brace. These replacements are done first
## because some of the special characters use '\i', which would otherwise get replaced by the
## "dotless i" Unicode character, and so the replacement dictionary here would not detect
## the proper LaTeX code for it.
## First, characters with a cedilla or comma below.
if (r'\c' in s):
trans = {r'\c{C}':'Ç', r'{\c C}':'Ç', r'\c{c}':'ç', r'{\c c}':'ç',
r'\c{E}':'Ȩ', r'{\c E}':'Ȩ', r'\c{e}':'ȩ', r'{\c e}':'ȩ',
r'\c{G}':'Ģ', r'{\c G}':'Ģ', r'\c{g}':'ģ', r'{\c g}':'ģ',
r'\c{K}':'Ķ', r'{\c K}':'Ķ', r'\c{k}':'ķ', r'{\c k}':'ķ',
r'\c{L}':'Ļ', r'{\c L}':'Ļ', r'\c{l}':'ļ', r'{\c l}':'ļ',
r'\c{N}':'Ņ', r'{\c N}':'Ņ', r'\c{n}':'ņ', r'{\c n}':'ņ',
r'\c{R}':'Ŗ', r'{\c R}':'Ŗ', r'\c{r}':'ŗ', r'{\c r}':'ŗ',
r'\c{S}':'Ş', r'{\c S}':'Ş', r'\c{s}':'ş', r'{\c s}':'ş',
r'\c{T}':'Ţ', r'{\c T}':'Ţ', r'\c{t}':'ţ', r'{\c t}':'ţ'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Characters with breve above. Note that the "\u" is problematic when "unicode_literals" is
## turned on via "from __future__ import unicode_literals". Thus, in the block below, rather
## than raw strings with single backslashes, we have to use double-backslashes.
if ('\\u' in s):
trans = {'\\u{A}':'Ă', '{\\u A}':'Ă', '\\u{a}':'ă', '{\\u a}':'ă',
'\\u{E}':'Ĕ', '{\\u E}':'Ĕ', '\\u{e}':'ĕ', '{\\u e}':'ĕ',
'\\u{G}':'Ğ', '{\\u G}':'Ğ', '\\u{g}':'ğ', '{\\u g}':'ğ',
'\\u{I}':'Ĭ', '{\\u I}':'Ĭ', '\\u{\i}':'ĭ', '{\\u\i}':'ĭ',
'\\u{O}':'Ŏ', '{\\u O}':'Ŏ', '\\u{o}':'ŏ', '{\\u o}':'ŏ',
'\\u{U}':'Ŭ', '{\\u U}':'Ŭ', '\\u{u}':'ŭ', '{\\u u}':'ŭ'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Characters with an ogonek.
if (r'\k' in s):
trans = {r'\k{A}':'Ą', r'{\k A}':'Ą', r'\k{a}':'ą', r'{\k a}':'ą',
r'\k(E}':'Ę', r'{\k E}':'Ę', r'\k{e}':'ę', r'{\k e}':'ę',
r'\k{I}':'Į', r'{\k I}':'Į', r'\k{i}':'į', r'{\k i}':'į',
r'\k{O}':'Ǫ', r'{\k O}':'Ǫ', r'\k{o}':'ǫ', r'{\k o}':'ǫ',
r'\k{U}':'Ų', r'{\k U}':'Ų', r'\k{u}':'ų', r'{\k u}':'ų'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Characters with hachek.
if (r'\v' in s):
trans = {r'\v{C}':'Č', r'{\v C}':'Č', r'\v{c}':'č', r'{\v c}':'č',
r'\v{D}':'Ď', r'{\v D}':'Ď', r'\v{d}':'ď', r'{\v d}':'ď',
r'\v{E}':'Ě', r'{\v E}':'Ě', r'\v{e}':'ě', r'{\v e}':'ě',
r'\v{L}':'Ľ', r'{\v L}':'Ľ', r'\v{l}':'ľ', r'{\v l}':'ľ',
r'\v{N}':'Ň', r'{\v N}':'Ň', r'\v{n}':'ň', r'{\v n}':'ň',
r'\v{R}':'Ř', r'{\v R}':'Ř', r'\v{r}':'ř', r'{\v r}':'ř',
r'\v{S}':'Š', r'{\v S}':'Š', r'\v{s}':'š', r'{\v s}':'š',
r'\v{T}':'Ť', r'{\v T}':'Ť', r'\v{t}':'ť', r'{\v t}':'ť',
r'\v{Z}':'Ž', r'{\v Z}':'Ž', r'\v{z}':'ž', r'{\v z}':'ž',
r'\v{H}':'\u021E', r'{\v H}':'\u021E', r'\v{h}':'\u021F', r'{\v h}':'\u021F',
r'\v{A}':'\u01CD', r'{\v A}':'\u01CD', r'\v{a}':'\u01CE', r'{\v a}':'\u01CE',
r'\v{I}':'\u01CF', r'{\v I}':'\u01CF', r'\v{\i}':'\u01D0', r'{\v \i}':'\u01D0',
r'\v{O}':'\u01D1', r'{\v O}':'\u01D1', r'\v{o}':'\u01D2', r'{\v o}':'\u01D2',
r'\v{U}':'\u01D3', r'{\v U}':'\u01D3', r'\v{u}':'\u01D4', r'{\v u}':'\u01D4',
r'\v{G}':'\u01E6', r'{\v G}':'\u01E6', r'\v{g}':'\u01E7', r'{\v g}':'\u01E7',
r'\v{K}':'\u01E8', r'{\v K}':'\u01E8', r'\v{k}':'\u01E9', r'{\v k}':'\u01E9'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
if (r'\H' in s):
trans = {r'\H{O}':u'Ő', r'\H{o}':u'ő', r'\H{U}':u'Ű', r'\H{u}':u'ű',
r'{\H O}':u'Ő', r'{\H o}':u'ő', r'{\H U}':u'Ű', r'{\H u}':u'ű'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Characters with a ring above.
if (r'\r' in s):
trans = {r'\r{U}':u'Ů', r'{\r U}':u'Ů', r'\r{u}':u'ů', r'{\r u}':u'ů'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Now let's do the straightforward accent characters.
trans = {r'\`A':u'\u00C1', r'\`a':u'\u00E0', r"\'A":u'\u00C1', r"\'a":u'\u00E1',
r'\~A':u'\u00C3', r'\^a':u'\u00E2', r'\"A':u'\u00C4', r'\~a':u'\u00E3',
r'\"a':u'\u00E4', r'\`E':u'\u00C8', r"\'E":u'\u00C9', r'\`e':u'\u00E8',
r'\^E':u'\u00CA', r"\'e":u'\u00E9', r'\"E':u'\u00CB', r'\^e':u'\u00EA',
r'\`I':u'\u00CC', r'\"e':u'\u00EB', r"\'I":u'\u00CD', r'\`\i':u'\u00EC',
r'\^I':u'\u00CE', r"\'\i":u'\u00ED', r'\"I':u'\u00CF', r'\^\i':u'\u00EE',
r'\~N':u'\u00D1', r'\"\i':u'\u00EF', r'\`O':u'\u00D2', r'\~n':u'\u00F1',
r"\'O":u'\u00D3', r'\`o':u'\u00F2', r'\^O':u'\u00D4', r"\'o":u'\u00F3',
r'\~O':u'\u00D5', r'\^o':u'\u00F4', r'\"O':u'\u00D6', r'\~o':u'\u00F5',
r'\"o':u'\u00F6', r'\`U':u'\u00D9', r"\'U":u'\u00DA', r'\`u':u'\u00F9',
r'\^U':u'\u00DB', r"\'u":u'\u00FA', r'\"U':u'\u00DC', r'\^u':u'\u00FB',
r"\'Y":u'\u00DD', r'\"u':u'\u00FC', r"\'y":u'\u00FD', r'\"y':u'\u00FF',
r'\=A':u'\u0100', r'\=a':u'\u0101', r"\'C":u'\u0106', r"\'c":u'\u0107',
r'\^C':u'\u0108', r'\^c':u'\u0109', r'\.C':u'\u010A', r'\.c':u'\u010B',
r'\=E':u'\u0112', r'\=e':u'\u0113', r'\.E':u'\u0116', r'\.e':u'\u0117',
r'\^G':u'\u011C', r'\^g':u'\u011D', r'\.G':u'\u0120', r'\.g':u'\u0121',
r'\^H':u'\u0124', r'\^h':u'\u0125', r'\~I':u'\u0128', r'\~\i':u'\u0129',
r'\=I':u'\u012A', r'\=\i':u'\u012B', r'\.I':u'\u0130', r'\^J':u'\u0134',
r'\^\j':u'\u0135', r"\'L":u'\u0139', r"\'N":u'\u0143', r"\'n":u'\u0144',
r'\=O':u'\u014C', r'\=o':u'\u014D', r"\'R":u'\u0154', r"\'r":u'\u0155',
r"\'S":u'\u015A', r"\'s":u'\u015B', r'\~U':u'\u0168', r'\~u':u'\u0169',
r'\=U':u'\u016A', r'\=u':u'\u016B', r'\^W':u'\u0174', r'\^w':u'\u0175',
r'\^Y':u'\u0176', r'\^y':u'\u0177', r"\'Z":u'\u0179', r"\'z":u'\u017A',
r'\.Z':u'\u017B', r'\.z':u'\u017C',
r'\`Y':u'Ỳ', r'\`y':u'ỳ', r"\'K":u'Ḱ', r"\'k":u'ḱ',
r"\'l":u'ĺ', r'\^A':u'Â', r'\^S':u'Ŝ', r'\^s':u'ŝ',
r'\"Y':u'Ÿ', r'\~E':u'Ẽ', r'\~e':u'ẽ', r'\~Y':u'Ỹ',
r'\~y':u'ỹ'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Do again for anyone using extra braces.
trans = {r'\`{A}':u'\u00C1', r'\`{a}':u'\u00E0', r"\'{A}":u'\u00C1', r"\'{a}":u'\u00E1',
r'\~{A}':u'\u00C3', r'\^{a}':u'\u00E2', r'\"{A}':u'\u00C4', r'\~{a}':u'\u00E3',
r'\"{a}':u'\u00E4', r'\`{E}':u'\u00C8', r"\'{E}":u'\u00C9', r'\`{e}':u'\u00E8',
r'\^{E}':u'\u00CA', r"\'{e}":u'\u00E9', r'\"{E}':u'\u00CB', r'\^{e}':u'\u00EA',
r'\`{I}':u'\u00CC', r'\"{e}':u'\u00EB', r"\'{I}":u'\u00CD', r'\`{\i}':u'\u00EC',
r'\^{I}':u'\u00CE', r"\'{\i}":u'\u00ED', r'\"{I}':u'\u00CF', r'\^{\i}':u'\u00EE',
r'\~{N}':u'\u00D1', r'\"{\i}':u'\u00EF', r'\`{O}':u'\u00D2', r'\~{n}':u'\u00F1',
r"\'{O}":u'\u00D3', r'\`{o}':u'\u00F2', r'\^{O}':u'\u00D4', r"\'{o}":u'\u00F3',
r'\~{O}':u'\u00D5', r'\^{o}':u'\u00F4', r'\"{O}':u'\u00D6', r'\~{o}':u'\u00F5',
r'\"{o}':u'\u00F6', r'\`{U}':u'\u00D9', r"\'{U}":u'\u00DA', r'\`{u}':u'\u00F9',
r'\^{U}':u'\u00DB', r"\'{u}":u'\u00FA', r'\"{U}':u'\u00DC', r'\^{u}':u'\u00FB',
r"\'{Y}":u'\u00DD', r'\"{u}':u'\u00FC', r"\'{y}":u'\u00FD', r'\"{y}':u'\u00FF',
r'\={A}':u'\u0100', r'\={a}':u'\u0101', r"\'{C}":u'\u0106', r"\'{c}":u'\u0107',
r'\^{C}':u'\u0108', r'\^{c}':u'\u0109', r'\.{C}':u'\u010A', r'\.{c}':u'\u010B',
r'\={E}':u'\u0112', r'\={e}':u'\u0113', r'\.{E}':u'\u0116', r'\.{e}':u'\u0117',
r'\^{G}':u'\u011C', r'\^{g}':u'\u011D', r'\.{G}':u'\u0120', r'\.{g}':u'\u0121',
r'\^{H}':u'\u0124', r'\^{h}':u'\u0125', r'\~{I}':u'\u0128', r'\~{\i}':u'\u0129',
r'\={I}':u'\u012A', r'\={\i}':u'\u012B', r'\.{I}':u'\u0130', r'\^{J}':u'\u0134',
r'\^{\j}':u'\u0135', r"\'{L}":u'\u0139', r"\'{N}":u'\u0143', r"\'{n}":u'\u0144',
r'\={O}':u'\u014C', r'\={o}':u'\u014D', r"\'{R}":u'\u0154', r"\'{r}":u'\u0155',
r"\'{S}":u'\u015A', r"\'{s}":u'\u015B', r'\~{U}':u'\u0168', r'\~{u}':u'\u0169',
r'\={U}':u'\u016A', r'\={u}':u'\u016B', r'\^{W}':u'\u0174', r'\^{w}':u'\u0175',
r'\^{Y}':u'\u0176', r'\^{y}':u'\u0177', r"\'{Z}":u'\u0179', r"\'{z}":u'\u017A',
r'\.{Z}':u'\u017B', r'\.{z}':u'\u017C',
r'\`{Y}':u'Ỳ', r'\`{y}':u'ỳ', r"\'{K}":u'Ḱ', r"\'{k}":u'ḱ',
r"\'{l}":u'ĺ', r'\^{A}':u'Â', r'\^{S}':u'Ŝ', r'\^{s}':u'ŝ',
r'\"{Y}':u'Ÿ', r'\~{E}':u'Ẽ', r'\~{e}':u'ẽ', r'\~{Y}':u'Ỹ',
r'\~{y}':u'ỹ'}
for c in trans:
if c in s: s = s.replace(c, trans[c])
## Those were easy. Next we look for single-char codes that must be followed by a non-alpha
## element, so that for example \orange will produce the "\orange" LaTeX command while
## "{\o}range" or "\o range" will produce "ørange". Since we're using regex and not Python's
## string objects, we need to replace all backslashes with double-backslashes. Thankfully,
## no other regex escape characters occur in this list of LaTeX markup for foreign letters.
trans = {r'\AA':u'Å', r'\AE':u'Æ', r'\aa':u'å', r'\ae':u'æ', r'\O':u'Ø', r'\o':u'ø',
r'\ss':u'ß', r'\i':u'ı', r'\L':u'Ł', r'\l':u'ł', r'\OE':u'Œ', r'\oe':u'œ',
r'\j':u'ȷ', r'\P':u'¶', r'\S':u'§', r'\DH':u'Ð', r'\dh':u'ð', r'\DJ':u'Đ',
r'\dj':u'đ', r'\IJ':u'IJ', r'\ij':u'ij', r'\NG':u'Ŋ', r'\ng':u'ŋ', r'\SS':u'ẞ',
r'\TH':u'Þ', r'\th':u'þ'}
for c in trans:
match = re.compile(c.replace('\\','\\\\')+r'(?!\w)', re.UNICODE)
s = match.sub(trans[c], s, re.UNICODE)
return(s)
## =============================
[docs]def parse_bst_template_str(bst_template_str, bibentry, variables, undefstr='???'):
'''
From an "options train" `[...|...|...]`, find the first defined block in the
train.
A Bibulous type of bibliography style template string contains grammatical featues
called options trains, of the form `[...|...|...]`. Each "block" in the train (divided
from the others by a `|` symbol), contains fields which, if defined, replace the
entire options train in the returned string.
Parameters
----------
bst_template_str : str
The string containing a complete entrytype bibliography style template.
variables : list of str
The list of variables defined within the template string.
bibentry : dict
An entry from the bibliography database.
undefstr : str
The string to replace undefined required fields with.
Returns
-------
arg : str
The string giving the entrytype block to replace an options train.
Example
-------
parse_bst_template_str() is given an options train "[<title>|<booktitle>]" and begins to
look into the bibliography entry. It does not find an "title" entry but finds a "booktitle"
entry. So, the function returns "<booktitle>", thereby replacing the train with the proper
defined variable.
'''
## Divide up the format string into
block_train = bst_template_str.split('|')
nblocks = len(block_train)
## Go through the if/elseif/else train of blocks within the string one by one. In each block,
## see if there are any variables defined (using the backslash). If no variables, then the
## block is "defined" and we return that block (i.e. replacing the train with that block). If
## an argument is defined, strip its leading backslash and look to see if the variable is
## defined in the bibdata entry. If so, the variable is defined and we return the backslash
## version of the variable (later code will do the variable replacement). If the variable is
## undefined (not present in the bibdata entry) then skip to the next block. If there is no
## next block, or if the block is empty (which means undefined by definition) then set the
## result to be the "undefined string".
for i in range(nblocks):
block = block_train[i]
if (block == ''):
#print('Required block "' + block + '" in "[' + bst_template_str + ']" is undefined.')
arg = undefstr
break
## If no variable exists inside the options-block, then just copy the text inside the
## block.
if ('<' not in block):
arg = block
break
## Count how many variables there are in the block. We can't just use
## "nvars = block.count(r'\')" because that would also count the LATEX formatting functions
## used within the block. Instead, we have to loop through the block string and check each
## instance of the "variables" list and count *those* one by one.
nvars = 0
for var in variables:
nvars += block.count(var)
vars_found = 0
arg_is_defined = [False]*nvars
## Loop through the list of variables and find which ones are defined within the
## bibliography entry.
for var in variables:
if var in block:
varname = var[1:-1] ## remove the angle brackets
if (varname in bibentry):
arg_is_defined[vars_found] = True
vars_found += 1
if (vars_found == nvars): break
if all(arg_is_defined):
arg = block
break
else:
arg = ''
return(arg)
## =============================
[docs]def namestr_to_namedict(namestr):
'''
Take a BibTeX string representing a single person's name and parse it into its first, middle, \
last, etc pieces.
BibTeX allows three different styles of author formats.
(1) A space-separated list, [firstname middlenames suffix lastname]
(2) A two-element comma-separated list, [prefix lastname, firstname middlenames]
(3) a three-element comma-separated list, [prefix lastname, suffix, firstname middlenames].
So, we can separate these three categories by counting the number of commas that appear.
Parameters
----------
namestr : str
The string containing a single person's name, in BibTeX format
Returns
-------
namedict : dict
A dictionary with keys "first", "middle", "prefix", "last", and "suffix".
'''
## First, if there is a comma within the string, then build the delimiter level list so you
## can check whether the comma is at delimiter level 0 (and therefore a valid comma for
## determining name structure). Using this, determine the locations of "valid" commas.
if (',' in namestr):
z = get_delim_levels(namestr, ('{','}'))
commapos = []
for match in re.finditer(',', namestr):
i = match.start()
if (z[i] == 0): commapos.append(i)
else:
commapos = []
## Note: switch on "len(commapos)" here rather than on "(',' not in namestr)" because the
## latter will produce an error when the comma occurs at brace levels other than zero.
if (len(commapos) == 0):
## Allow nametokens to be split by spaces *or* word ties ('~' == unbreakable spaces),
## except when the '~' is preceded by a backslash, in which case we have the LaTeX markup
## for the tilde accent. We use "stringsplit()" rather than the standard
## string object's "split()" function because we don't want the split to be applied when
## the separator is not at brace level zero.
nametokens = stringsplit(namestr, r' |(?<!\\)~')
for n in nametokens:
n_temp = n.strip('{').strip('}')[:-1]
## If we find a dot which is not preceded by a backslash and not succeeded by a dash.
## Also, we shouldn't care about dots appearing within curle braces, so we have to
## check for that as well.
z = get_delim_levels(namestr, ('{','}'))
for match in re.finditer(r'(?<!\\)\.(?!-)', n_temp):
i = match.start()
if (z[i] == 0):
print('Warning: The name token "' + n + '" in namestring "' + namestr + \
'" has a "." inside it, which may be a typo. Ignoring ...')
namedict = {}
if (len(nametokens) == 1):
namedict['last'] = nametokens[0]
elif (len(nametokens) == 2):
namedict['first'] = nametokens[0]
namedict['last'] = nametokens[1]
elif (len(nametokens) > 2):
namedict['first'] = nametokens[0]
namedict['last'] = nametokens[-1]
namedict['middle'] = ' '.join(nametokens[1:-1])
namedict = search_middlename_for_prefixes(namedict)
elif (len(commapos) == 1):
namedict = {}
(firstpart, secondpart) = splitat(namestr, commapos)
first_nametokens = firstpart.strip().split(' ')
second_nametokens = secondpart.strip().split(' ')
if (len(first_nametokens) == 1):
namedict['last'] = first_nametokens[0]
elif (len(first_nametokens) > 1):
namedict['prefix'] = ' '.join(first_nametokens[0:-1])
namedict['last'] = first_nametokens[-1]
namedict['first'] = second_nametokens[0]
if (len(second_nametokens) > 1):
namedict['middle'] = ' '.join(second_nametokens[1:])
namedict = search_middlename_for_prefixes(namedict)
elif (len(commapos) == 2):
namedict = {}
(firstpart, secondpart, thirdpart) = splitat(namestr, commapos)
first_nametokens = firstpart.strip().split(' ')
second_nametokens = secondpart.strip().split(' ')
third_nametokens = thirdpart.strip().split(' ')
if (len(second_nametokens) != 1):
print('Warning: the BibTeX format for namestr="' + namestr + '" is malformed.' + \
'\nThere should be only one name in the second part of the three comma-' \
'separated name elements.')
return({'last':'???'})
if (len(first_nametokens) == 1):
namedict['last'] = first_nametokens[0]
elif (len(first_nametokens) == 2):
namedict['prefix'] = ' '.join(first_nametokens[0:-1])
namedict['last'] = first_nametokens[-1]
namedict['suffix'] = second_nametokens[0]
namedict['first'] = third_nametokens[0]
if (len(third_nametokens) > 1):
namedict['middle'] = ' '.join(third_nametokens[1:])
#if (len(namedict['middle']) == 1):
# namedict['middle'] = namedict['middle'][0]
namedict = search_middlename_for_prefixes(namedict)
elif (len(commapos) == 3):
## The name format is (first,middle,prefix,last).
namedict = {}
(firstpart, secondpart, thirdpart, fourthpart) = splitat(namestr, commapos)
namedict['first'] = firstpart.strip()
namedict['middle'] = secondpart.strip()
namedict['prefix'] = thirdpart.strip()
namedict['last'] = fourthpart.strip()
#namedict['suffix'] =
elif (len(commapos) == 4):
## The name format is (first,middle,prefix,last,suffix).
namedict = {}
(firstpart, secondpart, thirdpart, fourthpart, fifthpart) = splitat(namestr, commapos)
namedict['first'] = firstpart.strip()
namedict['middle'] = secondpart.strip()
namedict['prefix'] = thirdpart.strip()
namedict['last'] = fourthpart.strip()
namedict['suffix'] = fifthpart.strip()
else:
print('Warning: the BibTeX format for namestr="' + namestr + '" is malformed.' + \
'\nThere should never be more than four commas in a given name.')
return({'last':'???'})
## If any tokens in the middle name start with lower case, then move them, and any tokens after
## them, to the prefix. An important difference is that prefixes typically don't get converted
## to initials when generating the formatted bibliography string.
if ('middle' in namedict) and (namedict['middle'] != ''):
nametokens = namedict['middle'].split(' ')
if not isinstance(nametokens, list): nametokens = [nametokens]
nametokens = [x for x in nametokens if x]
move = False
movelist = []
removelist = []
for i,t in enumerate(nametokens):
if t[0].islower(): move = True
if move:
movelist.append(i)
removelist.append(i)
## Add the tokens to the prefix and remove them from the middle name.
if movelist:
if ('prefix' not in namedict): namedict['prefix'] = ''
movetokens = [t for i,t in enumerate(nametokens) if i in movelist]
namedict['prefix'] = ' '.join(movetokens) + ' ' + namedict['prefix']
namedict['prefix'] = namedict['prefix'].strip()
if removelist:
nametokens = [t for i,t in enumerate(nametokens) if i not in removelist]
namedict['middle'] = ' '.join(nametokens)
return(namedict)
## ===================================
[docs]def search_middlename_for_prefixes(namedict):
'''
From the middle name of a single person, check if any of the names should be placed into the
"prefix" and move them there.
Parameters
----------
namedict : dict
The dictionary containing the key "middle", containing the string with the person's \
middle names/initials.
Returns
-------
namedict : dict
The dictionary augmented with the key "prefix" if a prefix is indeed found.
'''
if ('middle' not in namedict):
return(namedict)
## Search the list of middle names from the back to the front. If the name encountered starts
## with lower case, then it is moved to namedict['prefix']. If not, then stop the search and
## break out of the loop.
middlenames = namedict['middle'].split(' ')
prefix = []
for m in middlenames[::-1]:
if m[0].islower():
#pdb.set_trace()
prefix.append(middlenames.pop())
else:
break
if prefix:
namedict['prefix'] = ' '.join(prefix)
namedict['middle'] = ' '.join(middlenames)
return(namedict)
## =============================
[docs]def create_edition_ordinal(bibentry, key, options):
'''
Given a bibliography entry's edition *number*, format it as an ordinal (i.e. "1st", "2nd" \
instead of "1", "2") in the way that it will appear on the formatted page.
Parameters
----------
bibdata : dict
The bibliography database dictionary (constructed from *.bib files).
key : str
The key in `bibdata` defining the current entry being formatted.
options : dict, optional
Includes formatting options such as
'missing_data_excaptions': Whether to raise an exception when encountering missing data \
errors.
Returns
-------
editionstr : str
The formatted form of the edition, with ordinal attached.
'''
## TODO: need to replace the English-locale-dependent approach here with an international
## friendly approach. Any ideas?
## First check that the key exists.
if (key not in bibentry):
return(options['undefstr'])
if not ('edition' in bibentry):
#print('Warning: cannot find "edition" in entry "' + key + '"')
return(options['undefstr'])
if not bibentry['edition'].isdigit():
return(bibentry['edition'])
edition_number = bibentry['edition']
trans = {'first':'1', 'second':'2', 'third':'3', 'fourth':'4', 'fifth':'5', 'sixth':'6',
'seventh':'7', 'eighth':'8', 'ninth':'10', 'tenth':'10', 'eleventh':'11',
'twelfth':'12', 'thirteenth':'13', 'fourteenth':'14', 'fifteenth':'15',
'sixteenth':'16', 'seventeenth':'17', 'eighteenth':'18', 'nineteenth':'19',
'twentieth':'20'}
if edition_number.lower() in trans:
edition_number = trans[edition_number.lower()]
## Add the ordinal string to the number.
if (edition_number == '1'):
editionstr = '1st'
elif (edition_number == '2'):
editionstr = '2nd'
elif (edition_number == '3'):
editionstr = '3rd'
elif (int(edition_number) > 3):
editionstr = edition_number + 'th'
else:
if ('edition' in bibentry):
print('Warning: the edition number "' + str(edition_number) + '" given for entry ' + \
key + ' is invalid. Ignoring')
editionstr = options['undefstr']
return(editionstr)
## =============================
[docs]def export_bibfile(bibdata, filename):
'''
Write a bibliography database dictionary into a .bib file.
Parameters
----------
filename : str
The filename of the file to write.
bibdata : dict
The bibliography dictionary to write out.
'''
assert isinstance(filename, basestring), 'Input "filename" must be a string.'
f = open(filename, 'w')
for key in bibdata:
entry = bibdata[key]
f.write('@' + entry['entrytype'].upper() + '{' + key + ',\n')
del bibdata[key]['entrytype']
nkeys = len(entry.keys())
## Write out the entries.
for i,k in enumerate(entry):
f.write(' ' + k + ' = {' + str(entry[k]) + '}')
## If this is the last field in the dictionary, then do not end the line with a
## trailing comma.
if (i == (nkeys-1)):
f.write('\n')
else:
f.write(',\n')
f.write('}\n\n')
f.close()
return
## =============================
[docs]def parse_nameabbrev(abbrevstr):
'''
Given a string containing either a single "name" > "abbreviation" pair or a list of such pairs,
parse the string into a dictionary of names and abbreviations.
Parameters
----------
abbrevstr : str
The string containing the abbreviation definitions.
Returns
-------
nameabbrev_dict : dict
A dictionary with names for keys and abbreviations for values.
'''
nameabbrev_dict = {}
abbrevlist = abbrevstr.split(',')
for a in abbrevlist:
pair = a.split(' > ')
pair[0] = pair[0].strip()
nameabbrev_dict[pair[0]] = pair[1].strip()
return(nameabbrev_dict)
## ==================================================================================================
if (__name__ == '__main__'):
print('sys.argv=', sys.argv)
if (len(sys.argv) > 1):
try:
(opts, args) = getopt.getopt(sys.argv[1:], '', ['locale='])
except getopt.GetoptError as err:
# print help information and exit:
print(err) # will print something like "option -a not recognized"
## Add some print statements here telling the user the basic interface to the function.
sys.exit(2)
for o,a in opts:
if (o == '--locale'):
locale = a
else:
assert False, "unhandled option"
## Use this command on the command line:
#/home/nh/Lab\ Notes/Bibulous/biblatex.py /home/nh/Publications/2013\ OE\ -\ Review\ of\ Snapshot/snapshot_review.tex /home/nh/Lab\ Notes/Bibulous/osa.bst
#arg_texfile = sys.argv[0]
arg_auxfile = args[0]
#if (arg_texfile[0] != '/'):
# arg_texfile = os.getcwd() + '/' + arg_texfile
#files = [arg_texfile, arg_auxfile]
files = arg_auxfile
else:
## Use the test example input.
arg_bibfile = './test/test1.bib'
arg_auxfile = './test/test1.aux'
arg_bstfile = './test/test1.bst'
files = [arg_bibfile, arg_auxfile, arg_bstfile]
bibdata = Bibdata(files, debug=True)
bibdata.write_bblfile()
print('Writing to BBL file = ' + bibdata.filedict['bbl'])
#os.system('kwrite ' + bibdata.filedict['bbl'])
print('DONE')