Classes
Citator
CiteURL's main feature: a collection of schemas, and the tools to apply them to text, to find all kinds of citations in a text.
Attributes:
Name | Type | Description |
---|---|---|
schemas |
list |
A list of schema objects that this citator will try to match against. |
generic_id |
str |
A common regex the citator will append to each schema when it is loaded, to recognize a simple citation to the most-recently cited source. |
__init__(self, *yaml_paths, defaults=True, generic_id='\\b(Ib)?[Ii]d\\.(<\\/(i|em|u)>)?')
special
Calls load_yaml one or more times, to load the citator with schemas.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
defaults |
bool |
Whether to load CiteURL's default schemas |
True |
yaml_paths |
|
Additional YAML file paths, passed as individual arguments, that should be loaded to populate the citator's schemas. |
() |
generic_id |
str |
a common regex to append to all schemas, to recognize a simple citation to the most-recently cited source. Detects "id." or "ibid." by default. To disable, set to None. |
'\\b(Ib)?[Ii]d\\.(<\\/(i|em|u)>)?' |
Source code in citeurl/__init__.py
def __init__(
    self,
    *yaml_paths,
    defaults: bool=True,
    generic_id: str=GENERIC_ID
):
    """
    Set up the citator and fill it with schemas by calling load_yaml
    once per YAML file.

    Arguments:
        defaults: Whether to load CiteURL's default schemas. When
            true, they are loaded before any of the yaml_paths.
        yaml_paths: Additional YAML file paths, passed as individual
            arguments, that should be loaded to populate the
            citator's schemas.
        generic_id: a common regex to append to all schemas, to
            recognize a simple citation to the most-recently cited
            source. Detects "id." or "ibid." by default. To
            disable, set to None.
    """
    self.generic_id: str = generic_id
    self.schemas: list = []
    # The default schema file, when requested, goes ahead of any
    # user-supplied files so the load order stays defaults-first.
    if defaults:
        paths_to_load = (DEFAULT_YAML_PATH, *yaml_paths)
    else:
        paths_to_load = yaml_paths
    for yaml_path in paths_to_load:
        self.load_yaml(yaml_path)
insert_links(self, text, attrs={'class': 'citation'}, url_optional=False, link_detailed_ids=True, link_plain_ids=False, id_break_regex='L\\. ?Rev\\.|J\\. ?Law|\\. ?([Cc]ode|[Cc]onst)', id_break_indices=[])
Convenience method to return a copy of the given text, with citation hyperlinks inserted.
If you plan to do more than just insert links, it's better to get a list of citations with list_citations first, then insert those links with the module-wide insert_links function.
Source code in citeurl/__init__.py
def insert_links(
    self,
    text: str,
    attrs: dict={'class': 'citation'},
    url_optional: bool=False,
    link_detailed_ids: bool=True,
    link_plain_ids: bool=False,
    id_break_regex: str=DEFAULT_ID_BREAKS,
    id_break_indices: list=[]) -> str:
    """
    Convenience method to return a copy of the given text, with
    citation hyperlinks inserted.

    If you plan to do more than just insert links, it's better to
    get a list of citations with list_citations first, then insert
    those links with the module-wide insert_links function.

    Arguments:
        text: The text to scan for citations and add links to.
        attrs: Attributes for the inserted links; forwarded (as a
            copy) to the module-wide insert_links function.
        url_optional, link_detailed_ids, link_plain_ids: Forwarded
            unchanged to the module-wide insert_links function.
        id_break_regex: Forwarded to list_citations.
        id_break_indices: Forwarded (as a copy) to list_citations.
    Returns:
        Whatever the module-wide insert_links function returns for
        the citations found in the text.
    """
    # The defaults for `attrs` and `id_break_indices` are single
    # objects shared by every call. Hand copies to the callees so
    # that nothing they add can leak into later calls or into the
    # caller's own objects.
    citations = self.list_citations(
        text,
        id_break_regex=id_break_regex,
        id_break_indices=list(id_break_indices)
    )
    return insert_links(
        citations,
        text,
        attrs=dict(attrs),
        link_detailed_ids=link_detailed_ids,
        link_plain_ids=link_plain_ids,
        url_optional=url_optional
    )
list_authorities(self, text)
Convenience method to list all the authorities cited in a given text.
If you plan to do more than list authorities, it's better to get a list of citations with list_citations, then list the unique authorities with the module-wide list_authorities function.
Source code in citeurl/__init__.py
def list_authorities(self, text: str) -> list:
    """
    Shortcut that finds every citation in the text and hands the
    result to the module-wide list_authorities function.

    When you need the citations for anything else as well, call
    list_citations yourself and pass its result to the module-wide
    list_authorities function, rather than scanning the text twice.
    """
    return list_authorities(self.list_citations(text))
list_citations(self, text, id_forms=True, id_break_regex='L\\. ?Rev\\.|J\\. ?Law|\\. ?([Cc]ode|[Cc]onst)', id_break_indices=[])
Scan a text and return a list of all citations in it, in order of appearance.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
id_forms |
bool |
Whether to detect citations like "Id." and "Id. at 30." |
True |
id_break_regex |
str |
A pattern to look for in the text. Any occurrence of the pattern will interrupt a chain of "id." citations as if it were another citation. |
'L\\. ?Rev\\.|J\\. ?Law|\\. ?([Cc]ode|[Cc]onst)' |
id_break_indices |
list |
A list of positions in the text where "id." citations should be interrupted |
[] |
Returns:
Type | Description |
---|---|
list |
A list of citation objects, in order of appearance in the text. |
Source code in citeurl/__init__.py
def list_citations(self,
    text: str,
    id_forms: bool=True,
    id_break_regex: str=DEFAULT_ID_BREAKS,
    id_break_indices: list=[],
) -> list:
    """
    Scan a text and return a list of all citations in it, in
    order of appearance.

    Arguments:
        text: The text to scan for citations.
        id_forms: Whether to detect citations like
            "Id." and "Id. at 30."
        id_break_regex: A pattern to look for in the text. Any
            occurrence of the pattern will interrupt a chain of
            "id." citations as if it were another citation.
        id_break_indices: A list of positions in the text
            where "id." citations should be interrupted. The list
            is only read, never modified.
    Returns:
        A list of citation objects, in order of appearance in the
        text.
    """
    # First, get full citations:
    citations = []
    for schema in self.schemas:
        citations += schema.get_citations(text)
    # Then, add shortforms of each full citation
    shortform_cites = []
    for citation in citations:
        shortform_cites += citation._get_shortform_citations(text)
    citations = _sort_and_remove_overlaps(citations + shortform_cites)
    if not id_forms:  # no need to proceed
        return citations

    # Determine where to break chains of id. citations. This is built
    # in a fresh set instead of appending to the argument: the default
    # list is shared between calls, so appending to it would carry
    # break points from one text over to the next (and would also
    # modify a list owned by the caller).
    break_points = set(id_break_indices)
    # break at full or short citations
    break_points.update(citation.span[0] for citation in citations)
    if id_break_regex:  # also break at specified regexes
        break_points.update(
            match.span()[0]
            for match in re.compile(id_break_regex).finditer(text)
        )
    break_points = sorted(break_points)

    # loop through all citations to find their id citations
    id_citations = []
    for citation in citations:
        # find the next id break point after the end of this citation
        end_point = None
        last_checked = -1
        for last_checked, index in enumerate(break_points):
            if index > citation.span[1]:
                end_point = index
                break
        # drop the break points already passed, so that the search
        # for the next citation starts from here
        break_points = break_points[last_checked:]
        # get each citation's id citations until the break point
        id_citations += citation._get_id_citations(
            text, end_point=end_point
        )
    return _sort_and_remove_overlaps(citations + id_citations)
load_yaml(self, path, use_generic_id=True)
Import schemas from the specified YAML file into the citator.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
path |
str |
path to the YAML file to load |
required |
use_generic_id |
bool |
Whether to append the citator's generic_id citation format to the loaded schemas. |
True |
Source code in citeurl/__init__.py
def load_yaml(self, path: str, use_generic_id: bool=True):
    """
    Import schemas from the specified YAML file into the citator.

    Arguments:
        path: path to the YAML file to load
        use_generic_id: Whether to append the citator's generic_id
            citation format to the loaded schemas. Nothing is
            appended when the citator's generic_id is unset.
    """
    # Read as UTF-8 explicitly, so the result does not depend on the
    # platform's default text encoding.
    yaml_text = Path(path).read_text(encoding='utf-8')
    # An empty YAML document presumably parses to None (PyYAML's
    # safe_load does); treat that as "no schemas" rather than
    # failing when iterating over it.
    yaml_nodes = safe_load(yaml_text) or []
    for node in yaml_nodes:
        new_schema = Schema(**node)
        if use_generic_id and self.generic_id:
            new_schema.idForms.append(self.generic_id)
        self.schemas.append(new_schema)
lookup(self, query, broad=True)
Convenience method to get the first citation from the first matching schema, or None.
This is meant for cases where false positives are not an issue, so it uses broadRegex and case-insensitive matching by default.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
broad |
bool |
Whether to use case-insensitive regex matching and, if available, each schema's broadRegex. |
True |
query |
str |
The text to scan for a citation |
required |
Returns:
Type | Description |
---|---|
Citation |
A single citation object, or None |
Source code in citeurl/__init__.py
def lookup(self, query: str, broad: bool=True) -> Citation:
    """
    Convenience method to get the first citation from the first
    matching schema, or None.

    This is meant for cases where false positives are not an issue,
    so it uses broadRegex and case-insensitive matching by default.

    Arguments:
        broad: Whether to use case-insensitive regex matching and,
            if available, each schema's broadRegex.
        query: The text to scan for a citation
    Returns:
        A single citation object, or None
    """
    # Lazily ask each schema, in order, for its first citation; the
    # schemas after the first hit are never consulted.
    first_hits = (
        next(schema.get_citations(query, broad=broad), None)
        for schema in self.schemas
    )
    return next((hit for hit in first_hits if hit), None)
Schema
A pattern to recognize a single kind of citation and generate URLs from matches.
In most cases, it is more useful to use the Citator class to load schemas from YAML files and apply them en masse, rather than use the Schema class directly.
__init__(self, name, regex, URL=None, broadRegex=None, idForms=[], shortForms=[], defaults={}, mutations=[], substitutions=[], parent_citation=None, is_id=False)
special
Schema constructor. Primarily meant for use in loading YAML files and dynamically generating shortform schemas, but can be run directly if needed.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
name |
str |
The name of this schema |
required |
regex |
|
The pattern to recognize citations. Can be either a string, or a list of strings. In the latter case, they will be concatenated (without any separator) to form one string. In any case, the regex should include one or more named capture groups (i.e. "tokens") that will be used to generate the URL. |
required |
URL |
|
The template by which to generate URLs from citation matches. Placeholders in {curly braces} will be replaced by the value of the token with the same name, after that token has been processed with mutations and substitutions. The URL template can be provided either as a string or as a list of strings to concatenate. In the latter case, if a list item contains a placeholder for which no value is set, the list item will be skipped. |
None |
defaults |
dict |
A dictionary of tokens and corresponding default values which should be set if the token's value is not otherwise set by a regex capture group. |
{} |
mutations |
list |
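Dictionaries, each one representing a string manipulation that should be performed on a token before it is inserted into the URL template. Each mutation must contain a key called `token`, representing the token to affect. The supported mutations are `case`, `omit`, and the combination of `splitter` and `joiner`. `case` forces the token to the specified capitalization, either "upper" or "lower". `omit` is a string, parsed as regex, all occurrences of which will be removed from the token. `splitter` and `joiner` must be used together if at all. The former is a string, parsed as regex, which will split the token at each occurrence. Next, the `joiner` string will be placed between the pieces.
|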
[] |
substitutions |
list |
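A list of dictionaries, each one representing a lookup operation to modify the value of a token. Each dict must contain `token`, a string representing the input token for the lookup. It must also contain `index`, a dict of input values and their corresponding outputs. By default, the value of `token` will be changed to the value of the lookup. Alternatively, if you specify an `outputToken`, that token will be set instead, leaving the input token unchanged. Note that `outputToken` does not need to exist in the original regex. If the input token does not match a key in the index, the citation match fails, unless the substitution specifies that `allowUnmatched` is True, in which case a failed substitution simply won't change any values. You can also include `useRegex: true` to make the dictionary lookup use regex matching rather than normal string matching, but this feature is experimental and likely buggy. |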
[] |
shortForms |
list |
A list of regex templates to generate regexes that recognize short-forms of a parent long-form citation that has appeared earlier in the text. Any named section in {curly braces} will be replaced by the value of the corresponding token from the parent citation. So if a schema detects a longform citation to "372 U.S. 335" and has a shortform `{volume} {reporter} at (?P<pincite>\d+)`, it will generate the following regex: `372 U.S. at (?P<pincite>\d+)`. Like the regex parameter, each shortform can be given either as a string or as a list of strings. |
[] |
idForms |
list |
Think "id.", not ID. Identical to shortForms, except that these regexes will only match until the next different citation or other interruption. |
[] |
parent_citation |
|
The citation, if any, that this schema was created as a shortform of. This argument is for dynamically-generated schemas, and there is usually no need to use it manually. |
None |
is_id |
|
Whether this schema represents an immediate repeat shortform citation like "id." or "id. at 30". Really only relevant for procedurally-generated schemas. |
False |
Source code in citeurl/__init__.py
def __init__(self,
    name: str,
    regex,
    URL=None,
    broadRegex=None,
    idForms: list=[],
    shortForms: list=[],
    defaults: dict={},
    mutations: list=[],
    substitutions: list=[],
    parent_citation=None,
    is_id=False
):
    # Raw docstring: it contains regex examples such as \d, which are
    # invalid escape sequences in an ordinary string literal.
    r"""
    Schema constructor. Primarily meant for use in loading YAML
    files and dynamically generating shortform schemas, but can be
    run directly if needed.

    Arguments:
        name: The name of this schema
        regex: The pattern to recognize citations. Can be either
            a string, or a list of strings. In the latter case, they
            will be concatenated (without any separator) to form one
            string. In any case, the regex should include one or
            more named capture groups (i.e. "tokens") that will be
            used to generate the URL.
        URL: The template by which to generate URLs from citation
            matches. Placeholders in {curly braces} will be replaced
            by the value of the token with the same name, after that
            token has been processed with mutations and
            substitutions.

            The URL template can be provided either as a string
            or as a list of strings to concatenate. In the latter
            case, if a list item contains a placeholder for which
            no value is set, the list item will be skipped.
        broadRegex: An alternative pattern, given as a string or a
            list of strings like regex, that is used for "broad"
            matching when available.
        defaults: A dictionary of tokens and corresponding default
            values which should be set if the token's value is not
            otherwise set by a regex capture group.
        mutations: Dictionaries, each one representing a string
            manipulation that should be performed on a token before
            it is inserted into the URL template. Each mutation must
            contain a key called `token`, representing the token to
            affect.

            The supported mutations are `case`, `omit`, and the
            combination of `splitter` and `joiner`. `case` forces
            the token to the specified capitalization, either
            "upper" or "lower".

            `omit` is a string, parsed as regex, all occurrences
            of which will be removed from the token.

            `splitter` and `joiner` must be used together if at
            all. The former is a string, parsed as regex, which will
            split the token at each occurrence. Next, the `joiner`
            string will be placed between the pieces.
        substitutions: A list of dictionaries, each one representing
            a lookup operation to modify the value of a token. Each
            dict must contain `token`, a string representing the
            input token for the lookup. It must also contain `index`,
            a dict of input values and their corresponding outputs.

            By default, the value of `token` will be changed to the
            value of the lookup. Alternatively, if you specify an
            `outputToken`, that token will be set instead, leaving
            the input token unchanged. Note that `outputToken` does
            not need to exist in the original regex.

            If the input token does not match a key in the index,
            the citation match fails, unless the substitution
            specifies that `allowUnmatched` is True, in which case a
            failed substitution simply won't change any values.

            You can also include `useRegex: true` to
            make the dictionary lookup use regex matching rather
            than normal string matching, but this feature is
            experimental and likely buggy.
        shortForms: A list of regex templates to generate regexes
            that recognize short-forms of a parent long-form
            citation that has appeared earlier in the text.

            Any named section in {curly braces} will be replaced by
            the value of the corresponding token from the parent
            citation. So if a schema detects a longform citation to
            "372 U.S. 335" and has a shortform `{volume} {reporter}
            at (?P<pincite>\d+)`, it will generate the following
            regex: `372 U.S. at (?P<pincite>\d+)`.

            Like the regex parameter, each shortform can
            be given either as a string or as a list of strings.
        idForms: Think "id.", not ID. Identical to shortForms,
            except that these regexes will only match until the
            next different citation or other interruption.
        parent_citation: The citation, if any, that this schema
            was created as a shortform of. This argument is
            for dynamically-generated schemas, and there is usually
            no need to use it manually.
        is_id: Whether this schema represents an immediate repeat
            shortform citation like "id." or "id. at 30". Really
            only relevant for procedurally-generated schemas.
    """
    # Basic values
    self.name: str = name
    self.regex: str = _join_if_list(regex)
    self.is_id: bool = is_id
    # The URL template is always stored as a list of parts. As in
    # the original code, the attribute is only created when a
    # template was given; other code may rely on its absence
    # (NOTE(review): confirm against _get_url).
    if URL:
        self.URL: list = URL if isinstance(URL, list) else [URL]

    # Supplemental regexes
    self.broadRegex: str = _join_if_list(broadRegex) if broadRegex else None
    self.idForms: list = [_join_if_list(r) for r in idForms]
    self.shortForms: list = [_join_if_list(r) for r in shortForms]

    # String operators
    # Copy the mapping: the default `{}` is one object shared by every
    # schema created without explicit defaults, so storing it directly
    # would let a change made through one schema affect all of them.
    self.defaults: dict = dict(defaults or {})
    # Mutations and substitutions normally arrive as plain dicts and
    # are converted here; if that conversion raises TypeError the
    # values are stored as given.
    try:
        self.mutations: list = [self._Mutation(**m) for m in mutations]
    except TypeError:
        self.mutations: list = mutations
    try:
        self.substitutions: list = [
            self._Substitution(**s)
            for s in substitutions
        ]
    except TypeError:
        self.substitutions: list = substitutions

    # Extra data for shortform citations
    self.parent_citation: Citation = parent_citation
get_citations(self, text, broad=False, span=(0,))
Generator to return all citations the schema finds in text.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text |
str |
The text to scan for a citation |
required |
broad |
bool |
Whether to use case-insensitive regex matching and, if available, the schema's broadRegex. |
False |
span |
tuple |
A tuple of one or two values determining the start and end index of where in the text to search for citations. Defaults to (0,) to scan the entire text. |
(0,) |
Returns:
Type | Description |
---|---|
Iterable |
Generator that yields each citation the schema finds in the text, or None. |
Source code in citeurl/__init__.py
def get_citations(
    self,
    text: str,
    broad: bool=False,
    span: tuple=(0,)
) -> Iterable:
    """
    Generator that yields every citation this schema finds in text.

    Arguments:
        text: The text to scan for a citation
        broad: Whether to use case-insensitive regex matching and,
            if available, the schema's broadRegex.
        span: A tuple of one or two values determining
            the start and end index of where in the text to search
            for citations. Defaults to (0,) to scan the entire text.
    Returns:
        Generator that yields each citation the schema finds in the
        text. It yields nothing at all when there is no match.
    """
    pattern = self._compiled_re(broad)
    for regex_match in pattern.finditer(text, *span):
        try:
            found = Citation(regex_match, self)
        except KeyError:
            # a KeyError means a substitution failed, so this match
            # is not a valid citation; move on to the next one
            continue
        if found:
            yield found
lookup(self, text, broad=True, span=(0,))
Returns the first citation it finds in the text, or None.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
text |
str |
The text to scan for a citation. |
required |
broad |
bool |
Whether to use case-insensitive regex matching and, if available, the schema's broadRegex. |
True |
span |
tuple |
A tuple of one or two values determining the start and end index of where in the text to search for citations. Defaults to (0,) to scan the entire text. |
(0,) |
Returns:
Type | Description |
---|---|
Citation |
The first citation this schema finds in the scanned text, or None. |
Source code in citeurl/__init__.py
def lookup(
    self,
    text: str,
    broad: bool=True,
    span: tuple=(0,)
) -> Citation:
    """
    Returns the first citation it finds in the text, or None.

    Arguments:
        text: The text to scan for a citation.
        broad: Whether to use case-insensitive regex matching
            and, if available, the schema's broadRegex.
        span: A tuple of one or two values determining
            the start and end index of where in the text to search
            for citations. Defaults to (0,) to scan the entire text.
    Returns:
        The first citation this schema finds in the scanned text,
        or None.
    """
    # Give next() a default instead of wrapping it in a bare
    # `except:`. An exhausted generator still produces None, but
    # genuine errors (and KeyboardInterrupt) are no longer silently
    # swallowed.
    return next(self.get_citations(text, broad=broad, span=span), None)
Citation
A single citation found in text.
Attributes:
Name | Type | Description |
---|---|---|
text |
str |
The text of the citation itself, like "42 USC § 1988(b)" |
span |
tuple |
The beginning and end positions of this citation in the source text. |
schema |
Schema |
The schema which recognized this citation |
tokens |
dict |
Dictionary of the named capture groups from the regex this citation matched. For "id." and "shortform" citations, this includes tokens carried over from the parent citation. |
processed_tokens |
dict |
Dictionary of tokens after they have been modified via mutations and substitutions. |
URL |
str |
The URL where a user can read this citation online |
__init__(self, match, schema)
special
For internal use. There should be no need to create citations by means other than a Citator or Schema object.
Source code in citeurl/__init__.py
def __init__(
    self,
    match: re.Match,
    schema
):
    """
    For internal use. There should be no need to create citations
    by means other than a Citator or Schema object.

    Arguments:
        match: The regex match that this citation is built from.
        schema: The schema whose regex produced the match.
    """
    self.span: tuple = match.span()
    self.schema: Schema = schema
    self.text: str = match.group(0)

    captured = match.groupdict()
    parent = schema.parent_citation
    if parent:
        # idForm and shortForm citations start from a copy of the
        # parent citation's tokens; every group named in their own
        # regex then overrides the inherited value.
        inherited = dict(parent.tokens)
        inherited.update(captured)
        self.tokens: dict = inherited
    else:
        self.tokens: dict = captured

    self.processed_tokens: dict = self.schema._process_tokens(self.tokens)
    self.URL: str = self._get_url()
get_link(self, attrs={'class': 'citation'})
Return citation's link element, with given attributes
Source code in citeurl/__init__.py
def get_link(self, attrs: dict={'class': 'citation'}):
    """
    Return citation's link element, with given attributes.

    Arguments:
        attrs: HTML attributes to put on the <a> element. The dict
            is not modified. Any 'href' entry is replaced by the
            citation's URL, or dropped if the citation has no URL.
    Returns:
        A string with an <a> element wrapped around the citation's
        text.
    """
    # Work on a copy. The default dict is one object shared by every
    # call, so writing 'href' into it would leak one citation's URL
    # into the next call, and would also alter a dict passed in by
    # the caller.
    link_attrs = dict(attrs)
    if self.URL:
        link_attrs['href'] = self.URL
    else:
        # A citation without a URL must not link anywhere, even if
        # the caller supplied an href. pop() tolerates its absence.
        link_attrs.pop('href', None)
    attr_str = ''.join(
        ' %s="%s"' % (key, value)
        for key, value in link_attrs.items()
    )
    return '<a%s>%s</a>' % (attr_str, self.text)
Authority
A single source cited one or more times in a text.
Attributes:
Name | Type | Description |
---|---|---|
defining_tokens |
dict |
A dictionary of tokens that define this authority, such that any citations with incompatible token values will not match it. Note that this uses processed_tokens (those which have been modified by the schema's mutations and substitutions). |
schema |
Schema |
The schema which found all the citations to this authority |
citations |
list |
The list of all the citations that refer to this authority. |
base_citation |
|
A citation object representing the hypothetical generic citation to this authority. |
name |
str |
The text of base_citation |
__init__(self, first_cite, allowed_differences=[])
special
Define an authority by providing a single long-form citation, and the list of tokens which, if present in the citation, should be discarded from the definition of the authority.
Generates a base_citation to represent the generic instance of this authority.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
first_cite |
|
A long-form citation object representing the first and archetypal citation to this authority. The first_cite will be added as the first entry in the authority's citation list, and it will be used as the basis to generate the authority's base_citation. |
required |
allowed_differences |
list |
A list of tokens whose values can differ among citations to the same authority |
[] |
Source code in citeurl/__init__.py
def __init__(self, first_cite, allowed_differences: list=[]):
    """
    Define an authority by providing a single long-form citation,
    and the list of tokens which, if present in the citation, should
    be discarded from the definition of the authority.

    Generates a base_citation to represent the generic instance of
    this authority.

    Arguments:
        first_cite: A long-form citation object representing the
            first and archetypal citation to this authority. The
            first_cite will be added as the first entry in the
            authority's citation list, and it will be used as the
            basis to generate the authority's base_citation.
        allowed_differences: A list of tokens whose values can
            differ among citations to the same authority
    """
    # If first_cite isn't a longform, this is whatever longform
    # citation it is a child of.
    long_cite = first_cite._original_cite()
    self.schema: Schema = long_cite.schema
    self.citations: list = [first_cite]

    # List the token values that distinguish this authority from
    # others in the same schema. This uses processed tokens, not
    # raw, so that a citation to "50 U.S. 5" will match
    # a citation to "50 U. S. 5", etc. Tokens without a value and
    # tokens that are allowed to differ are left out.
    self.defining_tokens: dict = {
        token: value
        for token, value in first_cite.processed_tokens.items()
        if value is not None and token not in allowed_differences
    }

    # Next, derive a base citation to represent this authority from
    # the longform citation. If the derivation raises TypeError, fall
    # back to using first_cite itself.
    try:
        self.base_citation = self._derive_base_citation(long_cite)
    except TypeError:
        self.base_citation = first_cite

    # Set other instance variables
    self.name: str = self.base_citation.text
    self.URL: str = self.base_citation.URL

    # finally, give the first citation a reference to this authority
    first_cite.authority = self
include(self, citation)
Adds the citation to this authority's list of citations. Also, adds the `authority` tag to the citation, referring back to this authority.
Source code in citeurl/__init__.py
def include(self, citation):
    """
    Register a citation as one more reference to this authority.

    The citation's `authority` attribute is pointed back at this
    authority, and the citation is appended to this authority's
    list of citations.
    """
    citation.authority = self
    self.citations.append(citation)
matches(self, citation)
Checks whether a given citation matches the schema and defining tokens of this authority.
Source code in citeurl/__init__.py
def matches(self, citation) -> bool:
    """
    Tell whether a citation refers to this authority.

    The citation must come from a schema with the same name as this
    authority's schema, and each of the authority's defining tokens
    must be present in the citation's processed tokens with an equal
    value. Tokens that only the citation has are ignored.
    """
    if self.schema.name != citation.schema.name:
        return False
    cited_tokens = citation.processed_tokens
    has_conflict = any(
        token not in cited_tokens or cited_tokens[token] != expected
        for token, expected in self.defining_tokens.items()
    )
    return not has_conflict