changelog shortlog tags changeset files revisions annotate raw

pdw/getdata/ngcoba.py

changeset 312: 67e05551b1e3
parent:9bef51f2c7b9
author: rgrp <http://rufuspollock.org>
date: Thu Jan 28 10:03:15 2010 +0000 (11 days ago)
permissions: -rw-r--r--
description: [contrib/stats.py][s]: searching for person counts by name.
1import os
2import re
3import logging
4from xml.dom.minidom import parse
5import BeautifulSoup as bs
6
7from pdw.name import titles
8
9logger = logging.getLogger('getdata.ngcoba')
10
class AuthorEntryParser(object):
    """Parse the name line of an ngcoba author entry into its components.

    The line is matched against a verbose regular expression made of an
    optional author code, an optional title, the full name, a country
    section and a parenthesised ``(sex: birth - death)`` section.
    """

    def __init__(self):
        """Build and compile the name-line pattern.

        The alternatives for the ``title`` group are the module-level
        ``titles`` joined with ``|``.
        """
        # will ignore entries such as:
        #   P AARTSZ (see: Anton <A HREF="pa.htm">PA</A>NNEKOEK)
        # but this is OK because pseudonym is listed with main entry
        titles_str = str('|'.join(titles))
        # Raw string so that the regex escapes (\d, \s, \() are passed through
        # to the re module untouched. Note that '|' inside the character class
        # of the sex group is a literal, so that group also accepts '|'.
        self.line_regex = r"""
            (?P<authorcode>[A-Z]\d+\s)?
            (?:(?P<title>%(titles)s),)?
            (?P<fullname> [^{]+ )\s+
            (?P<country>[^(]*)
            (?# start of dates)
            (?:
                \(
                (?P<sex>[A|?|M|F]):\s*
                (?:(?P<bdate> [^-]+))
                \s-\s
                (?: (?P<ddate> .+))
                \)
            )
            """ % {'titles': titles_str}
        # (?P<extra> (?:(?:(?!\s\() [^,]+) , )* )
        self.line_pat = re.compile(self.line_regex, re.VERBOSE)

    def parse_line(self, line):
        """Split one author name line into a dictionary of sections.

        @param line: the first line of an author entry.
        @return: None when the line does not match the pattern, otherwise a
            dict with the keys ``authorcode`` (stripped, or None when absent),
            ``fullname``, ``name``, ``aka``, ``raw_bdate``, ``bdate``,
            ``raw_ddate``, ``ddate`` and ``country``.
        """
        mat = self.line_pat.match(line)
        if mat is None:
            return None

        sections = {}
        authorcode = mat.group('authorcode')
        if authorcode:
            # the pattern captures the whitespace that follows the code
            authorcode = authorcode.strip()
        sections['authorcode'] = authorcode

        sections['fullname'] = mat.group('fullname')
        name, name_aka_str = NameParser().parse(mat.group('fullname'))
        name.title = mat.group('title')
        sections['name'] = name.norm()
        sections['aka'] = name_aka_str

        date_parser = OurDateParser()
        sections['raw_bdate'] = mat.group('bdate')
        sections['bdate'] = date_parser.parse(sections['raw_bdate'])
        sections['raw_ddate'] = mat.group('ddate')
        sections['ddate'] = date_parser.parse(sections['raw_ddate'])
        sections['country'] = CountryParser().parse(mat.group('country'))

        # Cope with errant ngcoba2 records with duplicate authorcodes.
        # The grouping is explicit: a missing code can never equal one of the
        # literals below, so this is the same test the unparenthesised
        # "a and b or c or d" form performed.
        is_duplicate = bool(authorcode) and (
            (authorcode == u'B007808' and sections['raw_bdate'] == u'1688') or
            (authorcode == u'N000777' and
                sections['fullname'] == u'John PETERS') or
            (authorcode == u'R002268' and sections['country'] == u'CA')
        )
        if is_duplicate:
            # Strings have no append(); build the suffixed code instead.
            sections['authorcode'] = authorcode + u'a'

        return sections
62
63from swiss.date import DateutilDateParser
class OurDateParser(DateutilDateParser):
    """Date parser that cleans up ngcoba-specific annotations first.

    Alternatives such as ``1942 or 1938`` and ``(or Jun 22)`` and corrections
    such as ``(wrongly Dec 14)`` are stripped from the string before it is
    handed to ``DateutilDateParser.parse``.
    """

    def parse(self, datestring):
        """Parse a raw ngcoba date string.

        @param datestring: raw date text taken from an author line.
        @return: ``unicode()`` of the object returned by the base parser, or
            ``u''`` when any exception is raised while cleaning or parsing
            (a warning is logged in that case).
        """
        try:
            qualifier = None

            # Cope with the case with '1942 or 1938'
            ormatch = re.match(r'(\d+) or (\d+)', datestring)
            if ormatch:
                # use the 2nd date
                qualifier = 'Ambiguous date: "%s"' % datestring
                datestring = ormatch.group(2)

            # Cope with the case with '(or Jun 22)'
            or_group = r'(.*)\(or[^)]+\)(.*)'
            ormatch = re.match(or_group, datestring)
            if ormatch:
                # remove the '(or 5)'
                qualifier = 'Ambiguous date: "%s"' % datestring
                datestring = ''.join(ormatch.groups())

                # check for another group the same
                ormatch = re.match(or_group, datestring)
                if ormatch:
                    datestring = ''.join(ormatch.groups())

            # Cope with '(wrongly Dec 14)'
            if 'wrongly' in datestring:
                # remove the '(wrongly 14)'
                match = re.match(r'(.*)\(wrongly[^)]+\)(.*)', datestring)
                if not match:
                    # remove 'June (wrongly) 14'
                    match = re.match(r'(.*) \w+ \(wrongly\).*', datestring)
                if not match:
                    # Asserting a non-empty string always passes, so the
                    # failure used to go unnoticed. Raise instead; the handler
                    # below logs it and the date is ignored.
                    raise ValueError(
                        "Couldn't handle 'wrongly' in: '%s'" % datestring)
                datestring = ''.join(match.groups())

            # Get parsing done by main parser in date.py
            flexidate = DateutilDateParser.parse(self, datestring)

            # Add qualifiers processed in this method
            if flexidate and qualifier:
                flexidate.qualifier += qualifier

            # Convert to standardised string
            # NOTE(review): if the base parser can return None this yields
            # u'None' rather than u'' -- confirm against DateutilDateParser.
            flexidate_str = unicode(flexidate)
        except Exception as inst:
            logger.warning("Ignoring date %s due to exception: '%s'",
                           repr(datestring), inst)
            flexidate_str = u''
        return flexidate_str
115
116from pdw.name import Name, NameParserBase, name_tostr
class NameParser(NameParserBase):
    """Name parser for ngcoba full-name strings.

    Handles bracketed alternatives such as '(Percy)', 'nee' markers and
    ', aka' suffixes, and takes the surname from an all-capitals word.
    """

    def parse(self, fullname):
        """Parse a full name, reporting a bracket-free variant as an alias.

        @param fullname: full-name text from an author line.
        @return: tuple ``(name, aka)``. ``name`` is what the base parser
            returns for the name with any bracketed text kept (brackets
            removed). ``aka`` is the unicode form of the name with the
            bracketed text dropped, or None when the name has no brackets.
        """
        variants = []
        # Cope with brackets e.g. '(Percy)'
        dropped = re.match('(.*) ?\([^)]+\) ?(.*)', fullname)
        if dropped:
            without = ' '.join(dropped.groups()).strip()
            kept = re.match('(.*) ?\(([^)]+)\) ?(.*)', fullname)
            with_ = ' '.join(kept.groups()).strip() if kept else fullname
            variants = [with_, without]

        if not variants:
            return NameParserBase.parse(self, fullname), None

        primary = NameParserBase.parse(self, variants[0])
        aliases = []
        for variant in variants[1:]:
            aliases.append(unicode(NameParserBase.parse(self, variant)))
        return primary, ';'.join(aliases)

    def _toparts(self, fullname):
        """Build a ``Name`` carrying the surname and first names.

        @param fullname: full-name text.
        @return: a ``Name`` with ``ln`` and ``fns`` set when a surname could
            be located, otherwise a ``Name`` left as constructed.
        """
        # Cope with 'nee' TODO: deal with it better ...
        # NOTE(review): the second marker has no closing slash ('<i>ne<i>');
        # it may be a typo for '(<i>ne</i>)' -- confirm against the data.
        nee_markers = (
            '(<i>nee</i>)',
            '(<i>ne<i>)',
            '<i>nee</i>',
            '<i>ne</i>',
        )
        for marker in nee_markers:
            fullname = fullname.replace(marker, '{nee}')

        # Cope with ', aka' section
        aka_at = fullname.find(', aka')
        if aka_at != -1:
            fullname = fullname[:aka_at]

        name = Name()

        surname_patterns = (
            # first word of 2+ letters all caps
            '(?P<fns>.*?)(?P<ln> [A-Z]{2,20})(.*)',
            # final word of 1+ letters all caps
            '(?P<fns>.+)?(?P<ln> [A-Z]+)',
            # only word of 1+ letters all caps
            '(?P<fns>)(?P<ln>[A-Z]+)$',
        )
        match_obj = None
        for pattern in surname_patterns:
            match_obj = re.match(pattern, fullname)
            if match_obj:
                break

        # Build name object
        if match_obj:
            name.ln = match_obj.group('ln')
            first_names = match_obj.group('fns')
            if first_names is not None:
                name.fns = first_names.split()

        return name
174
class CountryParser(object):
    """Extract the text held between curly braces in a country section."""

    country_pat = '.*{(?P<country>.*)}.*'
    country_re = re.compile(country_pat)

    def parse(self, country_str):
        """Return the braced text of ``country_str``.

        @param country_str: country section of an author line.
        @return: the text between the braces, or None when the string is
            blank or holds no braced section (the latter logs a warning).
        """
        if country_str.strip() == '':
            return None
        found = self.country_re.match(country_str)
        if found:
            return found.group('country')
        logger.warn('Failed to match country string: %s', repr(country_str))
        return None
187
class WorkParser(object):
    """Pull the title and year out of a single work line."""

    work_pat = r"""
        (?P<gutenberg> <a.+</a>(\S+)?\s+)?
        (?:[*['.])*\s*
        (?P<title>[$(\w].*)\s*
        (?# unfortunately the ps: never matches ... )
        (\(ps:[^)]+\))?\s*
        \[ (.*\D)?(?P<year>(\d+|\?))(?P<date_qualifier>\?)? \]
        """
    work_re = re.compile(work_pat, re.VERBOSE | re.UNICODE)

    def parse(self, work_str):
        """Parse one work line.

        @param work_str: text of a work line, ending in a bracketed year.
        @return: tuple ``(title, year)``. Both are None when the line does
            not match (a warning is logged). ``year`` is None when the line
            gives '?' as the year. Any '(ps: ...)' tail is cut off the title.
        """
        found = self.work_re.match(work_str)
        if found is None:
            logger.warn('Failed to match work string: %s', repr(work_str))
            return (None, None)

        year = found.group('year')
        if year == '?':
            year = None

        title = found.group('title').strip()
        # the pattern's own ps group never matches, so trim the title here
        ps_at = title.find('(ps:')
        if ps_at != -1:
            title = title[:ps_at].strip()
        return (title, year)
211
212# for loading and parsing ngcoba data
class Loader(object):
    """Load an ngcoba html page, parse its author entries and store them."""

    isentry = re.compile('^\\w')

    def load(self, fileobj):
        """Parse every author entry found in ``fileobj``.

        @param fileobj: the html source, as accepted by BeautifulSoup.
        @return: list of the dicts produced by ``parse_raw_entry``; entries
            that could not be parsed are left out.
        """
        logger.info('START Loading and parsing a file')
        results = []
        for block_of_lines in self.load_lines(fileobj):
            out = self.parse_raw_entry(block_of_lines)
            if out is not None:
                results.append(out)
        logger.info('END Loading and parsing a file')
        return results

    def parse_raw_entry(self, block_of_lines):
        """Turn the lines of one entry into a data dictionary.

        @param block_of_lines: list of lines belonging to a single author.
        @return: the dict from ``AuthorEntryParser.parse_line`` extended with
            the keys ``ps`` and ``works``, or None when the block is empty,
            the name line does not parse, or parsing raises.
        """
        entry = self.parse_entry(block_of_lines)
        if entry is None:
            return None
        parser = AuthorEntryParser()
        # should be integrated into parse
        name_line = entry['name_line']
        try:
            datadict = parser.parse_line(name_line)
        except Exception as inst:
            logger.warning(
                "Skipping entry that caused exception in parsing: '%s' exception: '%s'",
                name_line.encode('utf8'), inst)
            return None
        if datadict is None:
            # ignore issues with aliases, flourishing dates and pseudonyms
            expected_miss = ('(see: ' in name_line or
                             ' fl ' in name_line or
                             'pseudonym' in name_line)
            if not expected_miss:
                logger.warning("Skipping name that could not be parsed: '%s'",
                               name_line.encode('utf8'))
            return None
        datadict['ps'] = entry['ps']
        datadict['works'] = entry['works']
        return datadict

    def load_lines(self, fileobj):
        '''Extract entries (blocks of lines: name + work listing) from html file.

        @return: list of list of lines corresponding to a single author.
        '''
        dom = bs.BeautifulSoup(fileobj, convertEntities="html")
        pre = dom.find('pre')
        # pre has some <a href inside it (illegal but hey ...)
        pre = str(pre)
        pre = unicode(pre, 'utf8')
        # open('debug.txt', 'w').write(pre)
        return self._split_entries(pre)

    def _split_entries(self, text):
        """Split text into blocks of lines separated by blank lines.

        Lines containing 'pre>' are dropped. A block is emitted at every
        blank line (so consecutive blank lines give empty blocks), and a
        non-empty block still open at the end of the text is emitted too, so
        the last entry is kept when no blank line follows it.
        """
        results = []
        # entry is more than name
        current = []
        for line in text.split('\n'):
            if line.strip() == '':  # blank line separating entries
                results.append(current)
                current = []
            elif 'pre>' not in line:
                current.append(line)
        if current:
            results.append(current)
        return results

    def parse_entry(self, entry_lines):
        """Sort the lines of one entry into name line, pseudonyms and works.

        @param entry_lines: list of lines for one author.
        @return: None for an empty list, otherwise a dict with ``name_line``
            (the first line), ``ps`` (pseudonyms from the most recent
            '(ps: a; b)' line) and ``works`` (every other line, stripped).
        """
        if len(entry_lines) == 0:
            return None
        entry = {'name_line': entry_lines[0], 'ps': [], 'works': []}
        for line in entry_lines[1:]:
            # yes you can ps;: (&ps;: Jan ORANJE)
            out = re.match(r'^\(&?ps;?:(.*)\)', line)
            if out:
                entry['ps'] = [ps.strip() for ps in out.group(1).split(';')]
            else:  # assume it is a work
                entry['works'].append(line.strip())
        return entry

    def load_to_db(self, fileobj):
        """Parse ``fileobj`` and store each author and their works.

        @param fileobj: the html source, as accepted by ``load``.
        """
        import pdw.model as model
        work_parser = WorkParser()
        for rec in self.load(fileobj):
            if rec is None:
                continue
            p = model.Person(
                name=rec['name'],
                extras={
                    'source': u'ngcoba',
                    'original': rec['fullname'],
                },
            )
            if rec['ps']:
                p.aka = '::'.join(rec['ps'])
            if rec['bdate']:
                p.birth_date = rec['raw_bdate']
                p.birth_date_normed = rec['bdate']
                # p.birth_date_ordered = # TODO
            if rec['ddate']:
                p.death_date = rec['raw_ddate']
                p.death_date_normed = rec['ddate']
                # p.death_date_ordered = # TODO
            # if rec['sex']
            for work_name in rec['works']:
                title, year = work_parser.parse(work_name)
                if title:
                    work = model.Work(
                        title=title,
                        date=year
                    )
                    p.works.append(work)
            # NOTE(review): the listing this was recovered from had lost its
            # indentation; commit/clear are assumed to run once per record --
            # confirm against the repository.
            model.Session.commit()
            model.Session.clear()
319
320