Package biblio :: Package webquery :: Module utils
[hide private]
[frames | no frames]

Source Code for Module biblio.webquery.utils

  1  #! /usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  """ 
  4  Various utilities. 
  5   
  6  """ 
  7   
  8  __docformat__ = 'restructuredtext en' 
  9   
 10   
 11  ### IMPORTS ### 
 12   
 13  import re 
 14   
 15  from bibrecord import PersonalName 
 16   
 17   
 18  ### CONSTANTS & DEFINES ### 
 19   
 20  # patterns for extracting editors names 
 21  EDITOR_PATS = [re.compile (x, flags=re.IGNORECASE+re.UNICODE) for x in 
 22     [ 
 23        r'^edited by\s+',   # "(edited )by ..." 
 24        r'\s*, editors\.?$',     # "..., editors" 
 25        r'^editors,?\s*',        # "editors, ..."         
 26     ] 
 27  ] 
 28   
 29  # patterns for extracting author info 
 30  STRIP_PATS = [re.compile (x, flags=re.IGNORECASE+re.UNICODE) for x in 
 31     [ 
 32        r'^by\s+',   # "by ..." 
 33        r'\s*;\s+with an introduction by .*$', 
 34        r'^\[\s*',                
 35        r'\s*\]$', 
 36        r'\.{3,}',               # "..." 
 37        r'et[\. ]al\.',          # "et al." 
 38        r'\[', 
 39        r'\]', 
 40        r'\([^\)]+\)',  
 41        r'\s*;.*$', 
 42     ] 
 43  ] 
 44  AND_PAT = re.compile (r'\s+and\s+') 
 45  COLLAPSE_SPACE_RE = re.compile (r'\s+') 
 46   
 47  PUBLISHER_RES = [re.compile (p, flags=re.IGNORECASE+re.UNICODE) for p in 
 48     [ 
 49        '^(?P<city>.*)\s*:\s*(?P<pub>.*)\s*,\s*c?(?P<year>\d{4})\.?$', 
 50        '^(?P<pub>.*)\.?$', 
 51     ] 
 52  ] 
 53   
 54  ### IMPLEMENTATION ### 
 55   
def normalize_isbn(isbn):
    """
    Remove formatting from an ISBN, making it suitable for web-queries.

    Spaces and hyphens are dropped, the result is lowercased (so an 'X'
    check digit becomes 'x') and any remaining surrounding whitespace is
    trimmed.
    """
    without_spaces = ''.join(isbn.split(' '))
    without_hyphens = ''.join(without_spaces.split('-'))
    return without_hyphens.lower().strip()
61 62
def parse_single_name(name_str):
    """
    Clean up an individual name into a more consistent format.

    :Parameters:
        name_str : string
            One personal name, written either as "family, given other" or
            as "given other family".

    :Returns:
        A PersonalName constructed from the given name, with its ``family``
        and ``other`` attributes set. Both are always strings, empty when
        that part of the name is absent.

    """
    family = other = ''
    # normalise space
    name_str = COLLAPSE_SPACE_RE.sub(' ', name_str.strip())
    # break into parts
    if ', ' in name_str:
        # if [family], [given] [other]
        family, _, given_other = name_str.partition(', ')
        family = family.strip()
        # everything after the first word is kept as a single string
        # (previously this was a list slice, giving a list-valued 'other')
        given, _, other = given_other.partition(' ')
    else:
        # if [given] [other] [family]
        name_parts = name_str.split(' ')
        given = name_parts[0]
        other_family = name_parts[1:]
        # the 'Madonna' clause: a lone word is a given name with no family
        if other_family:
            family = other_family[-1]
            other = ' '.join(other_family[:-1])
    # some tidying up: drop a sentence-ending period stuck to the family name
    if family.endswith('.'):
        family = family[:-1]
    # create name
    name = PersonalName(given)
    name.family = family
    name.other = other
    ## Postconditions & return:
    return name
96 97
def parse_names(name_str):
    """
    Clean up a list of names into a more consistent format.

    :Parameters:
        name_str : string
            The "author" attribute from a Xisbn record in XML.

    :Returns:
        A list of name objects, one per author, as produced by
        `parse_single_name`. The list is empty if no name text remains
        after clean-up.

    Xisbn data can be irregularly formatted, unpredictably including
    ancillary information. This function attempts to clean up the author
    field into a list of consistent author names.

    For example::

        >>> n = parse_names ("Leonard Richardson and Sam Ruby.")
        >>> print (n[0].family == 'Richardson')
        True
        >>> print (n[0].given == 'Leonard')
        True
        >>> print (not n[0].other)
        True
        >>> n = parse_names ("Stephen P. Schoenberger, Bali Pulendran")
        >>> print (n[0].family == 'Schoenberger')
        True
        >>> print (n[0].given == 'Stephen')
        True
        >>> print (n[0].other == 'P.')
        True
        >>> n = parse_names ("Madonna")
        >>> print (not n[0].family)
        True
        >>> print (n[0].given == 'Madonna')
        True
        >>> print (not n[0].other)
        True

    """
    # TODO: Xisbn authors fields are often appended with extra information
    # like "with a foreword by" etc. Largely these are separated from the
    # author list by semi-colons and so should be easy to strip off.

    ## Preconditions & preparation:
    # clean up string and return trivial cases
    name_str = name_str.strip()
    if not name_str:
        return []
    # strip extraneous and replace 'and'
    for pat in STRIP_PATS:
        name_str = pat.sub('', name_str)
    name_str = AND_PAT.sub(', ', name_str)
    ## Main:
    # Stripping can consume the whole string (e.g. "[et al.]") or leave blank
    # fragments between separators; skip those instead of building empty names.
    auth_list = [x for x in name_str.split(', ') if x.strip()]
    ## Postconditions & return:
    return [parse_single_name(x) for x in auth_list]
157 158
def parse_editing_info(name_str):
    """
    Detect whether names are marked as editors and remove that marker.

    :Parameters:
        name_str : string
            Text naming one or more people, possibly with an editing
            marker such as "edited by ..." or "..., editors".

    :Returns:
        A pair: whether editing information was recognised, and the
        (whitespace-trimmed) name string with that editing information
        removed. Only the first pattern in EDITOR_PATS that matches is
        removed.

    For example::

        >>> parse_editing_info ("Leonard Richardson and Sam Ruby.")
        (False, 'Leonard Richardson and Sam Ruby.')
        >>> parse_editing_info ("Ann Thomson.")
        (False, 'Ann Thomson.')
        >>> parse_editing_info ("Stephen P. Schoenberger, Bali Pulendran, editors.")
        (True, 'Stephen P. Schoenberger, Bali Pulendran')
        >>> parse_editing_info ("Madonna")
        (False, 'Madonna')

    """
    trimmed = name_str.strip()
    # An empty string falls straight through to the "not found" result.
    if trimmed:
        for editor_re in EDITOR_PATS:
            if editor_re.search(trimmed):
                return True, editor_re.sub('', trimmed)
    # no editing information found
    return False, trimmed
193 194
def parse_publisher(pub_str):
    """
    Parse a string of publisher information.

    :Parameters:
        pub_str : string
            text giving publisher details.

    :Returns:
        A tuple of strings, being (<publisher>, <city of publication>,
        <year of publication>). If no value is available, an empty string
        is returned in its place.

    As with author names, publication details are often inconsistently set
    out, even in bibliographic data. This function attempts to parse out and
    normalise the details.

    For example::

        >>> parse_publisher ('New York: Asia Pub. House, c1979.')
        ('Asia Pub. House', 'New York', '1979')
        >>> parse_publisher ('New York : LearningExpress, 1999.')
        ('LearningExpress', 'New York', '1999')
        >>> parse_publisher ('HarperTorch')
        ('HarperTorch', '', '')
        >>> parse_publisher ('Berkeley Heights, NJ: Enslow Publishers, c2000.')
        ('Enslow Publishers', 'Berkeley Heights, NJ', '2000')

    """
    fields = ('pub', 'city', 'year')
    # Surrounding whitespace would stop the end-anchored structured pattern
    # from matching, so trim it first (as the name parsers do).
    pub_str = pub_str.strip()
    # The loop variable must not be called 're': that would shadow the module.
    for pattern in PUBLISHER_RES:
        match = pattern.search(pub_str)
        if match:
            # groupdict('') maps groups that did not take part in the match
            # to '' rather than None, so .strip() is always safe; .get covers
            # fields the matching pattern does not define at all.
            match_vals = match.groupdict('')
            return tuple(match_vals.get(f, '').strip() for f in fields)
    return '', '', ''
231 232 233 234 ### TEST & DEBUG ### 235
def _doctest():
    """
    Run the doctest examples via ``doctest.testmod()`` with its defaults.
    """
    from doctest import testmod
    testmod()


### MAIN ###

if __name__ == '__main__':
    # Run as a script: exercise the doctest examples.
    _doctest()


### END ######################################################################