4from xml.dom.minidom import parse
5import BeautifulSoup as bs
7from pdw.name import titles
9logger = logging.getLogger('getdata.ngcoba')
11class AuthorEntryParser(object):
13 # will ignore entries such as:
14 # P AARTSZ (see: Anton <A HREF="pa.htm">PA</A>NNEKOEK)
15 # but this is OK because pseudonym is listed with main entry
16 titles_str = str('|'.join(titles))
18 (?P<authorcode>[A-Z]\d+\s)?
19 (?:(?P<title>%(titles)s),)?
20 (?P<fullname> [^{]+ )\s+
25 (?P<sex>[A|?|M|F]):\s*
31 """ % {'titles':titles_str}
32 # (?P<extra> (?:(?:(?!\s\() [^,]+) , )* )
33 self.line_pat = re.compile(self.line_regex, re.VERBOSE)
35 def parse_line(self, line):
36 mat = self.line_pat.match(line)
40 sections['authorcode'] = mat.group('authorcode')
41 if sections['authorcode']:
42 sections['authorcode'] = sections['authorcode'].strip()
43 sections['fullname'] = mat.group('fullname')
44 name, name_aka_str = NameParser().parse(mat.group('fullname'))
45 name.title = mat.group('title')
46 sections['name'] = name.norm()
47 sections['aka'] = name_aka_str
48 sections['raw_bdate'] = mat.group('bdate')
49 sections['bdate'] = OurDateParser().parse(sections['raw_bdate'])
50 sections['raw_ddate'] = mat.group('ddate')
51 sections['ddate'] = OurDateParser().parse(sections['raw_ddate'])
52 sections['country'] = CountryParser().parse(mat.group('country'))
54 # Cope with errant ngcoba2 records with duplicate authorcodes
55 if sections['authorcode'] and \
56 (sections['authorcode'] == u'B007808' and sections['raw_bdate'] == u'1688') or \
57 (sections['authorcode'] == u'N000777' and sections['fullname'] == u'John PETERS') or \
58 (sections['authorcode'] == u'R002268' and sections['country'] == u'CA'):
59 sections['authorcode'].append("a")
63from swiss.date import DateutilDateParser
class OurDateParser(DateutilDateParser):
    """Date parser that cleans up ngcoba date quirks before delegating.

    Ambiguity and correction annotations are stripped from the date string,
    which is then handed to ``DateutilDateParser.parse``.
    """

    def parse(self, datestring):
        """Return the parsed date as a standardised unicode string.

        Handles, in order:

        * ``'1942 or 1938'``   -- the second year is used and the date is
          flagged as ambiguous;
        * ``'... (or Jun 22)'`` -- the bracketed alternative is removed (up
          to two of them) and the date is flagged as ambiguous;
        * ``'(wrongly Dec 14)'`` / ``'June (wrongly) 14'`` -- the part
          marked as wrong is removed.

        Any exception raised while processing is logged as a warning and
        the date is ignored.
        """
        qualifier = ''
        try:
            # Cope with the case with '1942 or 1938'
            ormatch = re.match(r'(\d+) or (\d+)', datestring)
            if ormatch:
                qualifier = 'Ambiguous date: "%s"' % datestring
                datestring = ormatch.group(2)

            # Cope with the case with '(or Jun 22)'
            ormatch = re.match(r'(.*)\(or[^)]+\)(.*)', datestring)
            if ormatch:
                qualifier = 'Ambiguous date: "%s"' % datestring
                datestring = ''.join(ormatch.groups())

                # check for another group the same
                ormatch = re.match(r'(.*)\(or[^)]+\)(.*)', datestring)
                if ormatch:
                    datestring = ''.join(ormatch.groups())

            # Cope with '(wrongly Dec 14)'
            if 'wrongly' in datestring:
                # remove the '(wrongly 14)'
                match = re.match(r'(.*)\(wrongly[^)]+\)(.*)', datestring)
                if not match:
                    # remove 'June (wrongly) 14'
                    match = re.match(r'(.*) \w+ \(wrongly\).*', datestring)
                if match:
                    datestring = ''.join(match.groups())
                else:
                    # This used to be `assert "<message>"`, which can never
                    # fail because a non-empty string is always true.  Report
                    # the problem and carry on with the unmodified string,
                    # which is what actually happened before.
                    logger.warn("Couldn't handle 'wrongly' in: '%s'", datestring)

            # Get parsing done by main parser in date.py
            flexidate = DateutilDateParser.parse(self, datestring)

            # Add qualifiers processed in this method
            if flexidate and qualifier:
                flexidate.qualifier += qualifier

            # Convert to standardised string
            flexidate_str = unicode(flexidate)
        except Exception as inst:
            logger.warn("Ignoring date due to exception %s exception: '%s'", repr(datestring), inst)
            # NOTE(review): the value returned for an ignored date is not
            # visible in this extract; None is assumed -- confirm against
            # the callers that store the normalised date.
            return None
        return flexidate_str
116from pdw.name import Name, NameParserBase, name_tostr
117class NameParser(NameParserBase):
118 def parse(self, fullname):
120 # Cope with brackets e.g. '(Percy)'
121 without_match = re.match('(.*) ?\([^)]+\) ?(.*)', fullname)
123 without = ' '.join(without_match.groups()).strip()
124 with_match = re.match('(.*) ?\(([^)]+)\) ?(.*)', fullname)
126 with_ = ' '.join(with_match.groups()).strip()
130 fullnames.append(without)
133 name_ = NameParserBase.parse(self, fullnames[0])
135 for fullname in fullnames[1:]:
136 aka_name = NameParserBase.parse(self, fullname)
137 aka_names_str.append(unicode(aka_name))
138 aka_names_str = ';'.join(aka_names_str)
139 return name_, aka_names_str
141 return NameParserBase.parse(self, fullname), None
143 def _toparts(self, fullname):
144 # Cope with 'nee' TODO: deal with it better ...
145 fullname = fullname.replace('(<i>nee</i>)', '{nee}')
146 fullname = fullname.replace('(<i>ne<i>)', '{nee}')
147 fullname = fullname.replace('<i>nee</i>', '{nee}')
148 fullname = fullname.replace('<i>ne</i>', '{nee}')
150 # Cope with ', aka' section
151 if ', aka' in fullname:
152 fullname = fullname[:fullname.find(', aka')]
156 # Search for first word of 2+ letters all caps
157 match_obj = re.match('(?P<fns>.*?)(?P<ln> [A-Z]{2,20})(.*)', fullname)
159 # Search for final word of 1+ letters all caps
160 match_obj = re.match('(?P<fns>.+)?(?P<ln> [A-Z]+)', fullname)
162 # Search for only word of 1+ letters all caps
163 match_obj = re.match('(?P<fns>)(?P<ln>[A-Z]+)$', fullname)
168 name.ln = match_obj.group('ln')
169 fns = match_obj.group('fns')
171 name.fns = fns.split()
175class CountryParser(object):
176 country_pat = '.*{(?P<country>.*)}.*'
177 country_re = re.compile(country_pat)
179 def parse(self, country_str):
180 if not country_str.strip():
182 match_object = self.country_re.match(country_str)
184 logger.warn('Failed to match country string: %s', repr(country_str))
186 return match_object.group('country')
188class WorkParser(object):
190 (?P<gutenberg> <a.+</a>(\S+)?\s+)?
192 (?P<title>[$(\w].*)\s*
193 (?# unfortunately the ps: never matches ... )
195 \[ (.*\D)?(?P<year>(\d+|\?))(?P<date_qualifier>\?)? \]
197 work_re = re.compile(work_pat, re.VERBOSE | re.UNICODE)
199 def parse(self, work_str):
200 mat = self.work_re.match(work_str)
202 logger.warn('Failed to match work string: %s', repr(work_str))
204 title = mat.group('title').strip()
205 year = mat.group('year')
207 title = title[0:title.index('(ps:')].strip()
212# for loading and parsing ngcoba data
214 isentry = re.compile('^\\w')
216 def load(self, fileobj):
217 logger.info('START Loading and parsing a file')
218 raw_entries = self.load_lines(fileobj)
220 for block_of_lines in raw_entries:
221 out = self.parse_raw_entry(block_of_lines)
224 logger.info('END Loading and parsing a file')
227 def parse_raw_entry(self, block_of_lines):
228 entry = self.parse_entry(block_of_lines)
229 if entry is None: return
230 parser = AuthorEntryParser()
231 # should be integrated into parse
232 name_line = entry['name_line']
235 datadict = parser.parse_line(name_line)
236 except Exception, inst:
237 logger.warn("Skipping entry that caused exception in parsing: '%s' exception: '%s'", name_line.encode('utf8'), inst)
240 # ignore issues with aliases, flourishing dates and pseudonyms
241 if not( '(see: ' in name_line or ' fl ' in name_line or 'pseudonym' in
243 logger.warn("Skipping name that could not be parsed: '%s'", name_line.encode('utf8'))
245 datadict['ps'] = entry['ps']
246 datadict['works'] = entry['works']
249 def load_lines(self, fileobj):
250 '''Extract entries (block of lines ) from html file. (name + work listing) from html file.
252 @return: list of list of lines corresponding to a single author.
254 dom = bs.BeautifulSoup(fileobj, convertEntities="html")
255 pre = dom.find('pre')
256 # pre has some <a href inside it (illegal but hey ...)
258 pre = unicode(pre, 'utf8')
259 # open('debug.txt', 'w').write(pre)
261 # entry is more than name
263 for line in pre.split('\n'):
264 if line.strip() == '': # blank line separating entries
265 results.append(current)
267 elif 'pre>' not in line:
271 def parse_entry(self, entry_lines):
272 entry = {'name_line': None, 'ps': [], 'works': []}
273 if len(entry_lines) == 0:
275 entry['name_line'] = entry_lines[0]
276 for line in entry_lines[1:]:
277 # yes you can ps;: (&ps;: Jan ORANJE)
278 out = re.match('^\(&?ps;?:(.*)\)', line)
280 pses = out.group(1).split(';')
281 entry['ps'] = [ ps.strip() for ps in pses ]
282 else: # assume it is a work
283 entry['works'].append(line.strip())
286 def load_to_db(self, fileobj):
287 import pdw.model as model
288 results = self.load(fileobj)
290 if rec is None: continue
291 p = model.Person(name=rec['name'],
294 'original': rec['fullname'],
298 p.aka='::'.join(rec['ps'])
300 p.birth_date = rec['raw_bdate']
301 p.birth_date_normed = rec['bdate']
302# p.birth_date_ordered = # TODO
304 p.death_date = rec['raw_ddate']
305 p.death_date_normed = rec['ddate']
306# p.death_date_ordered = # TODO
308 work_parser = WorkParser()
309 for work_name in rec['works']:
310 title, year = work_parser.parse(work_name)
317 model.Session.commit()
318 model.Session.clear()