| 1 |
'''Extract information from dbpedia. |
|---|
| 2 |
|
|---|
| 3 |
Research Summary |
|---|
| 4 |
================ |
|---|
| 5 |
|
|---|
| 6 |
Many objects we are interested in do not have an explicit location but are |
|---|
| 7 |
given as place names. How do we proceed with this? Do we simply use points or |
|---|
| 8 |
move on to using locations? |
|---|
| 9 |
|
|---|
| 10 |
Geodata |
|---|
| 11 |
------- |
|---|
| 12 |
|
|---|
| 13 |
Points are encoded either as locations using |
|---|
| 14 |
http://www.georss.org/georss/point:: |
|---|
| 15 |
|
|---|
| 16 |
http://dbpedia.org/resource/New_Guinea, http://www.georss.org/georss/point, -5.33333333333 141.6 |
|---|
| 17 |
|
|---|
| 18 |
Or as separate longitude/latitude values using the W3C WGS84 vocabulary:: |
|---|
| 19 |
|
|---|
| 20 |
http://dbpedia.org/resource/New_Guinea, http://www.w3.org/2003/01/geo/wgs84_pos#long, 141.6000061035156 |
|---|
| 21 |
http://dbpedia.org/resource/New_Guinea, http://www.w3.org/2003/01/geo/wgs84_pos#lat, -5.333333492279053 |
|---|
| 22 |
''' |
|---|
| 23 |
import logging |
|---|
| 24 |
import datetime |
|---|
| 25 |
|
|---|
| 26 |
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|---|
| 27 |
|
|---|
| 28 |
import dateutil.parser |
|---|
| 29 |
|
|---|
| 30 |
import rdflib |
|---|
| 31 |
from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON |
|---|
| 32 |
|
|---|
| 33 |
# SPARQL namespace prefix declarations. The query classes below prepend this
# text to the query body they build, so that bodies can use short names such
# as ``skos:subject``.
PREFIXES = '''
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX : <http://dbpedia.org/resource/>
PREFIX dbpedia2: <http://dbpedia.org/property/>
PREFIX dbpedia: <http://dbpedia.org/>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
'''
|---|
| 45 |
|
|---|
| 46 |
class CategorySearch:
    '''Find the dbpedia subjects attached to a category via skos:subject.

    After `execute` has run, `self.results` holds whatever the SPARQL
    bindings return for the ``subject`` variable.
    '''

    def __init__(self):
        self.results = []

    def execute(self, category):
        '''Query the public dbpedia SPARQL endpoint for one category.

        @arg category: category name. Spaces are turned into underscores and
            the result is appended to http://dbpedia.org/resource/ to form
            the object of the skos:subject pattern.
        '''
        resource_name = '_'.join(category.split(' '))
        # % binds tighter than +, so only the query body is formatted; the
        # prefix block is prepended untouched.
        body = '''
            SELECT * WHERE {
                ?subject skos:subject <http://dbpedia.org/resource/%s>.
            }
        ''' % resource_name
        endpoint = SPARQLWrapper2('http://dbpedia.org/sparql')
        endpoint.setQuery(PREFIXES + body)
        endpoint.setReturnFormat(JSON)
        self.results = endpoint.query().getValues(u'subject')
|---|
| 68 |
|
|---|
| 69 |
|
|---|
| 70 |
class SPOQuery(object):
    '''Run a single subject / predicate / object triple pattern on dbpedia.

    Terms starting with ``?`` are SPARQL variables; anything else is sent as
    given, except that http(s) URLs are wrapped in angle brackets.

    After `execute`:
      * `self.query` is the full query text that was sent.
      * `self.results` is the list of plain values bound to the first
        variable found among subject, object, predicate (in that order).
    '''

    def __init__(self, verbose=False):
        '''
        @arg verbose: if true, print each query before it is sent.
        '''
        self.results = []
        self.verbose = verbose

    @staticmethod
    def _as_term(term):
        '''Wrap a bare http(s) URL in <...>; leave every other term alone.'''
        if term.startswith((u'http://', u'https://')):
            return u'<%s>' % term
        return term

    def execute(self, subject=u'?subject', predicate=u'?predicate', object=u'?object'):
        '''Build the query, send it and store the values in `self.results`.

        @arg subject: subject term (variable, URL or prefixed name).
        @arg predicate: predicate term.
        @arg object: object term.
        @raise ValueError: if both subject and object are variables.
        '''
        if subject.startswith('?') and object.startswith('?'):
            raise ValueError('Cannot have both subject and object not defined')

        subject, predicate, object = [
            self._as_term(term) for term in (subject, predicate, object)
        ]
        query = PREFIXES + '''
            SELECT * WHERE {
                %s %s %s
            }
        ''' % (subject, predicate, object)
        self.query = query
        logger.debug(self.query)
        if self.verbose:
            # single parenthesised argument: same output on Python 2 and 3
            print(self.query)

        sparql = SPARQLWrapper2('http://dbpedia.org/sparql')
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query()

        # Report the first unbound term. Subject and object keep their
        # original priority; the predicate is the fallback, so a query with
        # subject and object both bound no longer looks up a bogus variable
        # name derived from the object IRI.
        variable = None
        for term in (subject, object, predicate):
            if term.startswith('?'):
                variable = term[1:]
                break
        values = results.getValues(variable) if variable else None
        # getValues may hand back None instead of a list; treat as no rows
        self.results = [v.value for v in values or []]
|---|
| 104 |
|
|---|
| 105 |
|
|---|
| 106 |
import pprint |
|---|
| 107 |
import urllib |
|---|
| 108 |
class Describe:
    '''Fetch the DESCRIBE graph of one dbpedia resource and summarise it.

    Typical use: `execute(uri)` then `extract()` (plain dict), `to_str()`
    (text dump) or `to_factlet()` (project domain object).
    '''

    def __init__(self, verbose=False, recurse_for_location=False):
        '''
        @arg verbose: print progress / problems, and make `to_str` dump raw
            triples instead of the extracted summary.
        @arg recurse_for_location: if the resource has no longitude of its
            own, describe its places in turn and borrow the first
            coordinates found.
        '''
        self.results = []
        self.verbose = verbose
        # language tag a label / comment literal must carry to be used
        self.language = 'en'
        self.recurse_for_location = recurse_for_location

    def execute(self, uri):
        '''Run DESCRIBE <uri> and keep the resulting triples in self.results.

        Query failures are logged and leave `self.results` empty; they are
        not raised.

        @arg uri: dbpedia resource uri, or a wikipedia page url (converted
            to the corresponding dbpedia uri first).
        '''
        if 'wikipedia.org' in uri:
            uri = self._convert_wikipedia_url(uri)
        self.uri = uri
        # Reset first so that a failed query cannot leave the triples of a
        # previous execute() behind under the new uri.
        self.results = []
        query = PREFIXES + '''
            DESCRIBE <%s>
        ''' % uri
        sparql = SPARQLWrapper2('http://dbpedia.org/sparql')
        sparql.setQuery(query)
        # using describe does not result in usual result setup but a
        # ConjunctiveGraph instead
        # <http://rdflib.net/rdflib-2.4.0/html/public/rdflib.Graph.ConjunctiveGraph-class.html>
        try:
            self.results = sparql.query().convert()
        except Exception:
            # deliberate best effort, but keep the traceback in the log
            logger.warning('Had error on describe query: %s' % uri,
                           exc_info=True)

    def _convert_wikipedia_url(self, url):
        '''Return the dbpedia resource uri for a wikipedia page url.

        The last path segment is taken as the page name and percent-encoded.
        '''
        try:  # Python 2 module layout
            from urllib import quote, unquote
            from urlparse import urlparse
        except ImportError:  # Python 3 module layout
            from urllib.parse import quote, unquote, urlparse
        path = urlparse(url)[2]
        name = path.split('/')[-1]
        if not isinstance(name, str):
            # Python 2 unicode: quote() only copes with byte strings
            name = name.encode('utf-8')
        # wikipedia allows all kinds of stuff that is not a valid url, so the
        # name must be quoted; unquote first so escapes already present in
        # the url (e.g. %C3%B6) are not encoded a second time.
        name = quote(unquote(name))
        return 'http://dbpedia.org/resource/' + name

    def to_str(self):
        '''Return the uri followed by the raw triples (verbose mode) or by
        the pretty-printed `extract()` dict.'''
        output = self.uri + '\n\n'
        if self.verbose:
            for s, p, o in self.results:
                output += u'%s, %s, %s\n' % (s, p, o)
            return output
        output += pprint.pformat(self.extract())
        return output

    def __str__(self):
        return self.to_str()

    def extract(self):
        '''Summarise the triples whose subject is self.uri.

        @return: dict with keys title, description, start, end, source,
            place, long, lat, image_urls and license. start / end are
            datetimes when dates were found and '' otherwise.
        '''
        kwds = {
            'title': None,
            'description': None,
            'start': '',
            'end': '',
            'source': None,
            'place': [],
            'long': None,
            'lat': None,
            'image_urls': [],
            'license': u'GFDL',
        }
        found_dates = []
        for s, p, o in self.results:
            # the graph also contains triples with self.uri as object
            if s != self.uri:
                continue
            pred = str(p)

            # if pred == 'http://dbpedia.org/property/abstract':
            if pred == 'http://www.w3.org/2000/01/rdf-schema#comment':
                if getattr(o, 'language', None) == self.language:
                    kwds['description'] = unicode(o)
            # p:dateOfBirth, p:birthDate, p:death (Tycho Brahe),
            # http://dbpedia.org/property/date
            elif ('date' in pred or 'Date' in pred
                    or pred == 'http://dbpedia.org/property/death'):
                found_dates.extend(self.extract_dates(o))
            elif pred == 'http://www.w3.org/2000/01/rdf-schema#label':
                if getattr(o, 'language', None) == self.language:
                    kwds['title'] = unicode(o)
            elif pred in (
                    'http://dbpedia.org/property/place',
                    'http://dbpedia.org/property/birthPlace',
                    'http://dbpedia.org/property/deathPlace'):
                kwds['place'].append(unicode(o))
            # Also have georss:
            # http://www.georss.org/georss/point, 49.1532555556 16.8764916667
            # assume always have wgs84 so we don't need to check georss
            elif pred == 'http://www.w3.org/2003/01/geo/wgs84_pos#long':
                kwds['long'] = float(o)
            elif pred == 'http://www.w3.org/2003/01/geo/wgs84_pos#lat':
                kwds['lat'] = float(o)
            # appears this is where the source wiki page is stored ...
            elif pred == 'http://xmlns.com/foaf/0.1/page':
                kwds['source'] = unicode(o)
            elif pred == 'http://xmlns.com/foaf/0.1/img':
                kwds['image_urls'].append(unicode(o))

        # May have multiple date properties: start is the earliest date and
        # end the latest one, left as '' when there is only one distinct
        # date. Taking min / max over everything keeps this true however
        # many dates turn up and in whatever order.
        if found_dates:
            kwds['start'] = min(found_dates)
            latest = max(found_dates)
            if latest != kwds['start']:
                kwds['end'] = latest

        # get lat/long indirectly ... (test against None: 0.0 is a valid
        # longitude)
        if (self.recurse_for_location and kwds['place']
                and kwds['long'] is None):
            for place in kwds['place']:
                # TODO: only process places that are dbpedia uris ...
                if self.verbose:  # TODO: put this in debug
                    print('Processing place %s' % place)
                newd = Describe()
                newd.execute(place)
                out = newd.extract()
                # always assume if we have long have lat
                if out['long'] is not None:
                    if self.verbose:
                        print('Using place data from %s' % place)
                    # should probably record which place this came from
                    kwds['long'] = out['long']
                    kwds['lat'] = out['lat']
                    # once we have one set we don't need any more ...
                    break
        return kwds

    def extract_dates(self, o):
        '''Best-effort conversion of a date-ish rdf object into datetimes.

        Handles uris whose last segment is a date (as in
        http://dbpedia.org/resource/1943-11-18), typed date literals, and
        free text such as "June 6, 1944 - mid-July 1944" where the dash may
        be a unicode dash.

        @return: list of datetimes (possibly empty). Problems are never
            raised; they are logged at debug level and printed in verbose
            mode.
        '''
        dates = []
        try:
            if isinstance(o, rdflib.URIRef):
                datestr = str(o).split('/')[-1]
                dates.append(dateutil.parser.parse(datestr))
            else:
                o = o.toPython()
                # datetime.datetime is a subclass of datetime.date
                if isinstance(o, datetime.date):
                    dates.append(datetime.datetime(o.year, o.month, o.day))
                elif isinstance(o, basestring):
                    newstr = o.replace(u'\u2013', '-').replace(u'\u2014', '-')
                    # Only a spaced dash separates a range; a bare '-' is
                    # part of one date (1944-06-06, mid-July).
                    if ' - ' in newstr:
                        start, end = newstr.split(' - ', 1)
                    else:
                        start, end = newstr, None
                    for part in (start, end):
                        if not part:
                            continue
                        try:
                            dates.append(dateutil.parser.parse(part))
                        except (ValueError, OverflowError, TypeError):
                            logger.debug('Unparseable date text: %r', part)
                else:
                    raise ValueError('%s is not a useable date object' % o)
        except Exception as inst:
            logger.debug('Problem with extracting date from: %r (%r)',
                         o, inst)
            if self.verbose:
                print('Problem with extracting date from: %s' % o)
                try:
                    print(inst)
                except Exception:
                    print('Had exception but could not print it')
        return dates

    def to_factlet(self):
        '''Convert the extracted summary into a factlet domain object.

        Dates are turned into strings, the first image url (if any) is
        copied to 'image', and a Point 'location' is added when both
        coordinates are known.

        @return: whatever FactletConverter.to_domain_object returns.
        '''
        kwds = self.extract()
        # imported here so the module loads without the microfacts package
        from microfacts.lib.converter import FactletConverter
        conv = FactletConverter()
        # convert kwds dates back to strings ...
        kwds['start'] = str(kwds['start'])
        kwds['end'] = str(kwds['end'])
        # convert lists to non-lists (very crudely)
        if kwds['image_urls']:
            kwds['image'] = kwds['image_urls'][0]
        # test against None so longitude / latitude 0.0 still give a location
        if kwds['long'] is not None and kwds['lat'] is not None:
            kwds['location'] = {
                'type': 'Point',
                'coordinates': [kwds['long'], kwds['lat']],
            }
        return conv.to_domain_object(kwds)
|---|
| 310 |
|
|---|
| 311 |
|
|---|
| 312 |
# def describe2(): |
|---|
| 313 |
# '''This is what the snorql interface produced when doing describe. |
|---|
| 314 |
# |
|---|
| 315 |
# IMPOSSIBLY slow: must be because it is looking through whole graph ... |
|---|
| 316 |
# ''' |
|---|
| 317 |
# query1 = PREFIXES + ''' |
|---|
| 318 |
# SELECT ?property ?hasValue ?isValueOf |
|---|
| 319 |
# WHERE { |
|---|
| 320 |
# { <%s> ?property ?hasValue } |
|---|
| 321 |
# UNION |
|---|
| 322 |
# { ?isValueOf ?property <%s> } |
|---|
| 323 |
# } |
|---|
| 324 |
# ''' % (uri, uri) |
|---|
| 325 |
# query2 = PREFIXES + ''' |
|---|
| 326 |
# DESCRIBE <%s> |
|---|
| 327 |
# ''' % uri |
|---|
| 328 |
# sparql = SPARQLWrapper2('http://dbpedia.org/sparql') |
|---|
| 329 |
# sparql.setQuery(query1) |
|---|
| 330 |
# # IMPOSSIBLY slow: must be because it is looking through whole graph ... |
|---|
| 331 |
# sparql.query() |
|---|
| 332 |
# print results.variables |
|---|
| 333 |
# # values = results.getValues(u'subject') |
|---|
| 334 |
# # for v in values: |
|---|
| 335 |
# # print v.type |
|---|
| 336 |
# # print v.value |
|---|
| 337 |
|
|---|
| 338 |
|
|---|