1
2
3 """
4 Various utilities.
5
6 """
7
8 __docformat__ = 'restructuredtext en'
9
10
11
12
13 import re
14
15 from bibrecord import PersonalName
16
17
18
19
20
# Patterns that mark a name string as listing editors. parse_editing_info
# removes the matched text once one of them is recognised.
EDITOR_PATS = [
    re.compile(x, flags=re.IGNORECASE | re.UNICODE) for x in [
        r'^edited by\s+',
        r'\s*, editors\.?$',
        r'^editors,?\s*',
    ]
]
28
29
# Ancillary text that parse_names strips from an author string before the
# string is split into individual names. Patterns are applied in list order.
STRIP_PATS = [
    re.compile(x, flags=re.IGNORECASE | re.UNICODE) for x in [
        r'^by\s+',
        r'\s*;\s+with an introduction by .*$',
        r'^\[\s*',
        r'\s*\]$',
        r'\.{3,}',
        r'et[\. ]al\.',
        r'\[',
        r'\]',
        r'\([^\)]+\)',
        r'\s*;.*$',
    ]
]
# The conjunction between two names; parse_names rewrites it to ', '.
AND_PAT = re.compile(r'\s+and\s+')
# Any run of whitespace; collapsed to one space when tidying a single name.
COLLAPSE_SPACE_RE = re.compile(r'\s+')
46
# Patterns tried in order by parse_publisher; the first one that matches wins.
# Raw strings keep '\s' and '\d' as regex escapes rather than (invalid)
# string escapes.
PUBLISHER_RES = [
    re.compile(p, flags=re.IGNORECASE | re.UNICODE) for p in [
        # "<city> : <publisher>, c<year>."
        r'^(?P<city>.*)\s*:\s*(?P<pub>.*)\s*,\s*c?(?P<year>\d{4})\.?$',
        # Fallback: the whole string is the publisher. The name group is
        # non-greedy so that the optional trailing full stop is left out of it.
        r'^(?P<pub>.*?)\.?$',
    ]
]
53
54
55
# NOTE(review): the `def` line of this function is absent from the reviewed
# text; the name `normalize_isbn` is an assumption - confirm against callers.
def normalize_isbn(isbn):
    """
    Remove formatting from an ISBN, making it suitable for web-queries.

    :Parameters:
        isbn : string
            An ISBN, possibly containing spaces and hyphens.

    :Returns:
        The ISBN with spaces and hyphens removed, letters lower-cased (so a
        check digit 'X' becomes 'x') and surrounding whitespace stripped.

    """
    return isbn.replace(' ', '').replace('-', '').lower().strip()
61
62
def parse_single_name(name_str):
    """
    Clean up an individual name into a more consistent format.

    :Parameters:
        name_str : string
            A single personal name, written either as "Family, Given Other"
            or as "Given Other Family".

    :Returns:
        A PersonalName created from the given name, with its ``family`` and
        ``other`` attributes set to strings (empty when that part is
        absent).

    """
    family = given = other = ''

    # Normalise whitespace so the splits below only ever see single spaces.
    name_str = COLLAPSE_SPACE_RE.sub(' ', name_str.strip())

    if ', ' in name_str:
        # "Family, Given Other": the first word after the comma is the given
        # name, anything after it is kept as one string of other names.
        family, _, rest = name_str.partition(', ')
        family = family.strip()
        given, _, other = rest.partition(' ')
    else:
        # "Given Other Family"; a lone word is kept as the given name.
        parts = name_str.split(' ')
        given = parts[0]
        if len(parts) > 1:
            family = parts[-1]
            other = ' '.join(parts[1:-1])

    # Drop a sentence-ending full stop left on the family name.
    if family.endswith('.'):
        family = family[:-1]

    name = PersonalName(given)
    name.family = family
    name.other = other
    return name
96
97
def parse_names(name_str):
    """
    Clean up a list of names into a more consistent format.

    :Parameters:
        name_str : string
            The "author" attribute from a Xisbn record in XML.

    :Returns:
        A list with one entry per name found, each being the result of
        `parse_single_name` (an object with ``given``, ``family`` and
        ``other`` attributes). The list is empty if no names are found.

    Xisbn data can be irregularly formatted, unpredictably including
    ancillary information. This function attempts to clean up the author
    field into a list of consistent author names.

    For example::

        >>> n = parse_names ("Leonard Richardson and Sam Ruby.")
        >>> print (n[0].family == 'Richardson')
        True
        >>> print (n[0].given == 'Leonard')
        True
        >>> print (not n[0].other)
        True
        >>> n = parse_names ("Stephen P. Schoenberger, Bali Pulendran")
        >>> print (n[0].family == 'Schoenberger')
        True
        >>> print (n[0].given == 'Stephen')
        True
        >>> print (n[0].other == 'P.')
        True
        >>> n = parse_names ("Madonna")
        >>> print (not n[0].family)
        True
        >>> print (n[0].given == 'Madonna')
        True
        >>> print (not n[0].other)
        True

    """
    name_str = name_str.strip()
    if not name_str:
        return []

    # Remove ancillary text, then turn " and " into a list separator.
    for pat in STRIP_PATS:
        name_str = pat.sub('', name_str)
    name_str = AND_PAT.sub(', ', name_str)

    # The steps above can leave empty or comma-tailed fragments (for example
    # "A, B, and C" becomes "A, B,, C"); tidy each fragment and skip the
    # empty ones so that no blank names are produced.
    fragments = [x.strip().strip(',').strip() for x in name_str.split(', ')]
    return [parse_single_name(x) for x in fragments if x]
157
158
def parse_editing_info(name_str):
    """
    Detect whether names are marked as editors and remove that marking.

    :Parameters:
        name_str : string
            Text giving one or more names, possibly with editing
            information such as "edited by ..." or "..., editors."

    :Returns:
        A tuple of whether editing information was recognised and the name
        string with that editing information removed. Blank input gives
        ``(False, '')``.

    For example::

        >>> parse_editing_info ("Leonard Richardson and Sam Ruby.")
        (False, 'Leonard Richardson and Sam Ruby.')
        >>> parse_editing_info ("Ann Thomson.")
        (False, 'Ann Thomson.')
        >>> parse_editing_info ("Stephen P. Schoenberger, Bali Pulendran, editors.")
        (True, 'Stephen P. Schoenberger, Bali Pulendran')
        >>> parse_editing_info ("Madonna")
        (False, 'Madonna')

    """
    name_str = name_str.strip()
    if not name_str:
        return False, ''

    # Only the first pattern that matches is applied; subn reports whether
    # anything was removed, so no separate search is needed.
    for pat in EDITOR_PATS:
        cleaned, hits = pat.subn('', name_str)
        if hits:
            return True, cleaned

    return False, name_str
193
194
def parse_publisher(pub_str):
    """
    Parse a string of publisher information.

    :Parameters:
        pub_str : string
            text giving publisher details.

    :Returns:
        A tuple of strings, being (<publisher>, <city of publication>,
        <year of publication>). If no value is available, an empty string
        is returned in its place.

    As with author names, publication details are often inconsistently set
    out, even in bibliographic data. This function attempts to parse out and
    normalise the details.

    For example::

        >>> parse_publisher ('New York: Asia Pub. House, c1979.')
        ('Asia Pub. House', 'New York', '1979')
        >>> parse_publisher ('New York : LearningExpress, 1999.')
        ('LearningExpress', 'New York', '1999')
        >>> parse_publisher ('HarperTorch')
        ('HarperTorch', '', '')
        >>> parse_publisher ('Berkeley Heights, NJ: Enslow Publishers, c2000.')
        ('Enslow Publishers', 'Berkeley Heights, NJ', '2000')

    """
    fields = ('pub', 'city', 'year')
    # The first pattern that matches decides the result. The loop variable
    # is deliberately not called `re`, which would hide the regex module.
    for pat in PUBLISHER_RES:
        match = pat.search(pub_str)
        if match:
            found = match.groupdict()
            # A group that the pattern lacks, or that took no part in the
            # match, is reported as an empty string.
            return tuple((found.get(f) or '').strip() for f in fields)
    return '', '', ''
231
232
233
234
235
def _doctest():
    """
    Run the doctests embedded in this module's docstrings.
    """
    # Imported here so that doctest is only loaded when the tests are run.
    import doctest
    doctest.testmod()


if __name__ == '__main__':
    _doctest()
245
246
247
248