root/trunk/microfacts/getdata/dbpedia.py

Revision 242, 12.1 kB (checked in by rgrp, 1 month ago)

[getdata][xs]: add another property to date list (p:death).

Line 
1 '''Extract information from dbpedia.
2
3 Research Summary
4 ================
5
6 Many objects we are interested in do not have an explicit location but are
7 given as place names. How do we proceed with this? Do we simply use points or
8 move on to using locations?
9
10 Geodata
11 -------
12
13 Points are encoded either as locations using
14 http://www.georss.org/georss/point::
15
16     http://dbpedia.org/resource/New_Guinea, http://www.georss.org/georss/point, -5.33333333333 141.6
17
18 Or using wgs stuff::
19
20     http://dbpedia.org/resource/New_Guinea, http://www.w3.org/2003/01/geo/wgs84_pos#long, 141.6000061035156
21     http://dbpedia.org/resource/New_Guinea, http://www.w3.org/2003/01/geo/wgs84_pos#lat, -5.333333492279053
22 '''
23 import logging
24 import datetime
25
26 logger = logging.getLogger(__name__)
27
28 import dateutil.parser
29
30 import rdflib
31 from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON
32
# SPARQL namespace declarations. Each query built in this module is formed by
# prepending this text to the query body, so the short prefixes declared here
# (e.g. ``skos:``) can be used inside those query bodies.
PREFIXES = '''
        PREFIX owl: <http://www.w3.org/2002/07/owl#>
        PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
        PREFIX foaf: <http://xmlns.com/foaf/0.1/>
        PREFIX dc: <http://purl.org/dc/elements/1.1/>
        PREFIX : <http://dbpedia.org/resource/>
        PREFIX dbpedia2: <http://dbpedia.org/property/>
        PREFIX dbpedia: <http://dbpedia.org/>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
'''
45
class CategorySearch:
    '''Find the dbpedia subjects that are filed under a given category.

    Call ``execute`` with a category name; the matches are then available
    on ``self.results`` (an empty list until a search has been run).
    '''

    def __init__(self):
        self.results = []

    def execute(self, category):
        '''Query the dbpedia SPARQL endpoint for members of ``category``.

        Spaces in ``category`` are turned into underscores before the name is
        placed into a ``http://dbpedia.org/resource/`` URI. Whatever the
        wrapper returns for the ``subject`` variable is stored unchanged on
        ``self.results``; nothing is returned.
        '''
        resource_name = category.replace(' ', '_')
        # Only the literal is %-formatted; PREFIXES is prepended afterwards.
        sparql_text = PREFIXES + '''
            SELECT * WHERE {
                ?subject skos:subject <http://dbpedia.org/resource/%s>.
            }
        ''' % resource_name

        endpoint = SPARQLWrapper2('http://dbpedia.org/sparql')
        endpoint.setQuery(sparql_text)
        endpoint.setReturnFormat(JSON)
        bindings = endpoint.query()
        self.results = bindings.getValues(u'subject')
68
69
class SPOQuery(object):
    '''Run a single subject/predicate/object triple pattern against dbpedia.

    Any of the three positions may be a SPARQL variable (a string starting
    with ``?``) or a concrete term. After ``execute`` the values bound to one
    of the variables are available on ``self.results`` and the query text on
    ``self.query``.
    '''

    def __init__(self, verbose=False):
        self.results = []
        self.verbose = verbose

    def execute(self, subject=u'?subject', predicate=u'?predicate', object=u'?object'):
        '''Build and run the triple-pattern query.

        @arg subject, predicate, object: either a variable (``?name``) or a
            term. Terms starting with ``http://`` are wrapped in angle
            brackets; anything else is inserted into the query as given.

        ``self.results`` is set to the ``.value`` of each binding of the
        first variable found among subject, object, predicate (in that
        order). If no position is a variable the result list is empty.

        Raises an Exception when both subject and object are variables.
        '''
        if subject.startswith('?') and object.startswith('?'):
            raise Exception('Cannot have both subject and object not defined')

        def correct(x):
            # Full URIs must be written as <uri> inside a SPARQL pattern.
            if x.startswith(u'http://'):
                return u'<%s>' % x
            return x

        subject, predicate, object = [
            correct(x) for x in (subject, predicate, object)
        ]
        query = PREFIXES + '''
            SELECT * WHERE {
                %s %s %s
            }
        ''' % (subject, predicate, object)
        self.query = query
        logger.debug(self.query)
        if self.verbose:
            # Parenthesised single argument: prints the same text whether
            # ``print`` is a statement (Python 2) or a function (Python 3).
            print(self.query)
        sparql = SPARQLWrapper2('http://dbpedia.org/sparql')
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query()

        # Choose which variable to read back. Subject and object keep their
        # original precedence; predicate is the fallback so that a pattern
        # with only the predicate unknown no longer looks up a bogus name
        # derived from a bound term.
        variable = None
        for term in (subject, object, predicate):
            if term.startswith('?'):
                variable = term[1:]
                break

        values = None
        if variable is not None:
            values = results.getValues(variable)
        # NOTE(review): getValues appears to hand back an empty/None-like
        # value when the variable is unknown -- treat that as "no results".
        self.results = [v.value for v in (values or [])]
104        
105
106 import pprint
107 import urllib
# Text-type aliases so the class below works whether or not the interpreter
# still has the separate ``unicode``/``basestring`` builtins.
try:
    _text_type = unicode
    _string_types = basestring
except NameError:  # Python 3: every str is already unicode text
    _text_type = str
    _string_types = str


class Describe:
    '''Fetch the DESCRIBE triples for a dbpedia resource and summarise them.

    Typical use: ``execute(uri)`` to load the triples, then ``extract()`` for
    a plain dict of title/description/dates/places/coordinates/images, or
    ``to_factlet()`` to turn that dict into a microfacts domain object.
    '''

    def __init__(self, verbose=False, recurse_for_location=False):
        '''
        @arg verbose: print progress/diagnostics and make ``to_str`` dump
            the raw triples instead of the extracted summary.
        @arg recurse_for_location: when the resource has places but no
            longitude of its own, describe those places to borrow theirs.
        '''
        self.results = []
        # Set by execute(); defined here so to_str()/extract() are usable on
        # a fresh instance.
        self.uri = None
        self.verbose = verbose
        self.language = 'en'
        self.recurse_for_location = recurse_for_location

    def execute(self, uri):
        '''Run ``DESCRIBE <uri>`` and keep the outcome on ``self.results``.

        A uri containing ``wikipedia.org`` is first mapped to the matching
        dbpedia resource uri. Query errors are logged, not raised, and leave
        ``self.results`` empty.
        '''
        if 'wikipedia.org' in uri:
            uri = self._convert_wikipedia_url(uri)
        self.uri = uri
        # Drop triples from any earlier call so that a failed query cannot
        # leave another resource's data paired with this uri.
        self.results = []
        query2 = PREFIXES + '''
    DESCRIBE <%s>
    ''' % uri
        sparql = SPARQLWrapper2('http://dbpedia.org/sparql')
        sparql.setQuery(query2)
        # using describe does not result in usual result setup but a
        # ConjunctiveGraph instead
        # <http://rdflib.net/rdflib-2.4.0/html/public/rdflib.Graph.ConjunctiveGraph-class.html>
        try:
            self.results = sparql.query().convert()
        except Exception as inst:
            logger.warning('Had error on describe query: %s (%s)', uri, inst)

    def _convert_wikipedia_url(self, url):
        '''Return the dbpedia resource uri for a wikipedia page url.

        The last path segment of ``url`` is used as the resource name.
        '''
        try:
            from urllib.parse import quote, unquote, urlparse
        except ImportError:  # Python 2 module layout
            from urllib import quote, unquote
            from urlparse import urlparse
        path = urlparse(url)[2]
        name = path.split('/')[-1]
        # wikipedia allows all kinds of stuff that is not a valid url, so the
        # name has to be quoted. Unquote first: a name that already arrives
        # percent-encoded must not have its '%' signs encoded a second time.
        name = quote(unquote(name))
        return 'http://dbpedia.org/resource/' + name

    def to_str(self):
        '''Return the uri followed by either the raw triples (verbose) or
        the pretty-printed ``extract()`` dict.'''
        output = (self.uri or '') + '\n\n'
        if self.verbose:
            for s, p, o in self.results:
                output += u'%s, %s, %s\n' % (s, p, o)
            return output

        output += pprint.pformat(self.extract())
        return output

    def __str__(self):
        return self.to_str()

    @staticmethod
    def _merge_date(kwds, newdate):
        '''Widen the start/end range held in ``kwds`` to cover ``newdate``.

        ``start`` ends up as the earliest date seen and ``end`` as the latest
        (left as '' while only one distinct date is known), independent of
        the order in which dates arrive.
        '''
        if not kwds['start']:
            kwds['start'] = newdate
        elif newdate < kwds['start']:
            # The old start only becomes the end if nothing later is known.
            if not kwds['end']:
                kwds['end'] = kwds['start']
            kwds['start'] = newdate
        elif newdate > kwds['start']:
            if not kwds['end'] or newdate > kwds['end']:
                kwds['end'] = newdate
        # o/w do nothing since same as start date

    def extract(self):
        '''Summarise the triples about ``self.uri`` as a dict.

        Keys: title, description, start, end, source, place (list), long,
        lat, image_urls (list) and license. Only triples whose subject is
        ``self.uri`` are used. Title and description are taken from objects
        whose ``language`` equals ``self.language``.
        '''
        kwds = {'title': None,
                'description': None,
                'start': '',
                'end': '',
                'source': None,
                'place': [],
                'long': None,
                'lat': None,
                'image_urls': [],
                'license': u'GFDL',
                }
        # Compare as plain text so the check does not depend on how the rdf
        # term classes define equality against ordinary strings.
        own_uri = _text_type(self.uri)
        for s, p, o in self.results:
            # in some cases get self.uri as object as well ...
            if _text_type(s) != own_uri:
                continue
            pred = _text_type(p)

            # if pred == 'http://dbpedia.org/property/abstract':
            if pred == 'http://www.w3.org/2000/01/rdf-schema#comment':
                if getattr(o, 'language', None) == self.language:
                    kwds['description'] = _text_type(o)

            # p:dateOfBirth
            # p:birthDate
            # p:death (Tycho Brahe)
            # http://dbpedia.org/property/date
            elif ('date' in pred or 'Date' in pred
                    or pred == 'http://dbpedia.org/property/death'):
                # may have multiple date properties ...
                for newdate in self.extract_dates(o):
                    self._merge_date(kwds, newdate)

            elif pred == 'http://www.w3.org/2000/01/rdf-schema#label':
                if getattr(o, 'language', None) == self.language:
                    kwds['title'] = _text_type(o)
            # elif 'place' in pred or 'Place' in pred
            elif pred in ('http://dbpedia.org/property/place',
                          'http://dbpedia.org/property/birthPlace',
                          'http://dbpedia.org/property/deathPlace'):
                kwds['place'] = kwds['place'] + [_text_type(o)]
            # Also have georss:
            # http://www.georss.org/georss/point, 49.1532555556 16.8764916667
            # assume always have wgs84 so we don't need to check georss
            elif pred == 'http://www.w3.org/2003/01/geo/wgs84_pos#long':
                kwds['long'] = float(o)
            elif pred == 'http://www.w3.org/2003/01/geo/wgs84_pos#lat':
                kwds['lat'] = float(o)
            # appears this is where the source wiki page is stored ...
            elif pred == 'http://xmlns.com/foaf/0.1/page':
                kwds['source'] = _text_type(o)
            elif pred == 'http://xmlns.com/foaf/0.1/img':
                kwds['image_urls'] = kwds['image_urls'] + [_text_type(o)]

        # get lat/long indirectly ...
        # ``is None`` rather than truthiness: a longitude of 0.0 is a real
        # coordinate and must not trigger the lookup.
        if (self.recurse_for_location and kwds['place']
                and kwds['long'] is None):
            for place in kwds['place']:
                # A place given as plain text cannot be put into a DESCRIBE
                # query, so only uris are followed.
                if not place.startswith(('http://', 'https://')):
                    continue
                if self.verbose:  # TODO: put this in debug
                    print('Processing place %s' % place)
                newd = Describe()
                newd.execute(place)
                out = newd.extract()
                # always assume if we have long have lat
                if out['long'] is not None:
                    if self.verbose:
                        print('Using place data from %s' % place)
                    # should probably record which place this came from somehow
                    kwds['long'] = out['long']
                    kwds['lat'] = out['lat']
                    # once we have one set we don't need any more ...
                    break
        return kwds

    def extract_dates(self, o):
        '''Return the list of datetimes that can be read out of ``o``.

        Handles three shapes of value: a uri whose last path segment is a
        date (http://dbpedia.org/resource/1943-11-18), a value whose
        ``toPython()`` is a date/datetime, and free text such as
        "June 6, 1944 - mid-July 1944". Anything that cannot be parsed is
        skipped; the method itself does not raise.
        '''
        dates = []
        value = o
        try:
            if isinstance(o, rdflib.URIRef):
                datestr = _text_type(o).split('/')[-1]
                dates.append(dateutil.parser.parse(datestr))
            else:
                value = o.toPython()
                if isinstance(value, (datetime.date, datetime.datetime)):
                    dates.append(
                        datetime.datetime(value.year, value.month, value.day))
                elif isinstance(value, _string_types):
                    # ranges may be written with an en or em dash
                    text = value.replace(u'\u2013', u'-')
                    text = text.replace(u'\u2014', u'-')
                    # Only a spaced hyphen separates a range; a bare '-' is
                    # part of a single date ("1944-06-06", "mid-July 1944").
                    if ' - ' in text:
                        parts = text.split(' - ', 1)
                    else:
                        parts = [text]
                    for part in parts:
                        try:
                            dates.append(dateutil.parser.parse(part))
                        except Exception as part_error:
                            # Best effort: the parser fails in several ways
                            # on free text, and one bad half of a range must
                            # not discard the other half.
                            if self.verbose:
                                print('Could not parse date %r: %s'
                                      % (part, part_error))
                else:
                    raise ValueError('%s is not a useable date object' % value)
        except Exception as inst:
            if self.verbose:
                print('Problem with extracting date from: %s' % value)
                try:
                    print(inst)
                except Exception:
                    print('Had exception but could not print it')
        return dates

    def to_factlet(self):
        '''Convert the ``extract()`` summary into a microfacts domain object.

        Dates are passed on as strings, the first image url (if any) as
        ``image`` and the coordinates (if any) as a ``Point`` location.
        '''
        kwds = self.extract()
        from microfacts.lib.converter import FactletConverter
        conv = FactletConverter()
        # convert kwds dates back to strings ...
        kwds['start'] = str(kwds['start'])
        kwds['end'] = str(kwds['end'])
        # convert lists to non-lists (very crudely)
        if kwds['image_urls']:
            kwds['image'] = kwds['image_urls'][0]
        # ``is not None`` so that a longitude of 0.0 still yields a location
        if kwds['long'] is not None:
            kwds['location'] = {
                'type': 'Point',
                'coordinates': [kwds['long'], kwds['lat']],
            }
        fct = conv.to_domain_object(kwds)
        return fct
310
311
312 # def describe2():
313 #     '''This is what the snorql interface produced when doing describe.
314 #     
315 #     IMPOSSIBLY slow: must be because it is looking through whole graph ...
316 #     '''
317 #     query1 = PREFIXES + '''
318 # SELECT ?property ?hasValue ?isValueOf
319 # WHERE {
320 #   { <%s> ?property ?hasValue }
321 #   UNION
322 #   { ?isValueOf ?property <%s> }
323 # }
324 # ''' % (uri, uri)
325 #     query2 = PREFIXES + '''
326 # DESCRIBE <%s>
327 # ''' % uri
328 #     sparql = SPARQLWrapper2('http://dbpedia.org/sparql')
329 #     sparql.setQuery(query1)
330 #     # IMPOSSIBLY slow: must be because it is looking through whole graph ...
331 #     sparql.query()
332 #     print results.variables
333 #     # values = results.getValues(u'subject')
334 #     # for v in values:
335 #         # print v.type
336 #         # print v.value
337
338
Note: See TracBrowser for help on using the browser.