Changeset 208

Show
Ignore:
Timestamp:
09/16/08 22:29:06 (4 months ago)
Author:
rgrp
Message:

[dbpedia][m]: add in support for generic dbpedia queries and use this to start getting more napoleon data.

  • Discover some minor bugs/issues with converter code (json.py)
    • when end is empty string generating a date (01/01/01)
    • cannot supply id when creating a new factlet or thread ...
  • Fixed both of these but second one causes issues elsewhere which are yet to be resolved.
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/microfacts/data/napoleon.js

    r207 r208  
    11{ 
     2  "id"    : "b6f9b907-3033-4acc-accd-e7b8f7f29deb", 
    23  "title" : "Battles in the Napoleonic Wars", 
    34  "factlets" : [ 
    45    { 
     6      "id" : "a880106e-ce43-4463-8884-bafe67270aa8", 
    57      "title" : "Battle of Austerlitz", 
    68      "image" : "http://upload.wikimedia.org/wikipedia/commons/5/56/Austerlitz-baron-Pascal.jpg", 
     
    1214    }, 
    1315    { 
     16      "id" : "04d3d5a3-a2e5-41ab-a71b-306c424b2ccb", 
    1417      "title" : "Battle of Borodino", 
    1518      "description" : "The Battle of Borodino (September 7, 1812, or August 26 in the Julian calendar then used in Russia), was the largest and bloodiest single-day battle of the Napoleonic Wars, involving more than a quarter of a million soldiers. It was fought by the French ''Grande Armée'' under Napoleon I and the Imperial Russian army of General Mikhail Kutusov near the village of Borodino, west", 
     
    1922    }, 
    2023    { 
     24      "id" : "40432a1e-b08a-413e-b36a-4f281ce06f7d", 
    2125      "title" : "Battle of Waterloo", 
    2226      "source" : "http://en.wikipedia.org/wiki/Battle_of_Waterloo", 
     
    2630    }, 
    2731    { 
     32      "id" : "67c0272a-d469-4bc8-8c3e-5fe14c8f3b60", 
    2833      "title" : "Battle of Trafalgar", 
    2934      "source" : "http://en.wikipedia.org/wiki/Battle_of_Trafalgar", 
     
    3338    }, 
    3439    { 
     40      "id" : "a4728f72-405a-4804-b2c6-9b37ba2e44ed", 
    3541      "title" : "Battle of Jena-Auerstadt", 
    3642      "source" : "http://en.wikipedia.org/wiki/Battle_of_Jena-Auerstedt", 
     
    3945    }, 
    4046    { 
     47      "id" : "3a1d1201-ea85-4a5b-a59f-b957b42d22fc", 
    4148      "title" : "Battle of Friedland", 
    4249      "start" : "1807-06-14", 
    4350      "description" : "The Battle of Friedland, fought on June 14, 1807 about twenty-seven miles (43 km) southeast of the modern Russian city of Kaliningrad, just north of Poland, was a major engagement in the Napoleonic Wars effectively ending the War of the Fourth Coalition. The conflict involved forces of the First French Empire against the army of the Russian Empire", 
    4451      "location" : {"type": "Point", "coordinates": [21.0167 , 54.45] } 
    45        
    4652    } 
    4753  ] 
  • trunk/microfacts/getdata/dbpedia.py

    r207 r208  
    1 ''' 
     1'''Extract information dbpedia. 
    22 
    33Research Summary 
     
    2121    http://dbpedia.org/resource/New_Guinea, http://www.w3.org/2003/01/geo/wgs84_pos#lat, -5.333333492279053 
    2222''' 
     23import logging 
    2324import datetime 
     25 
     26logger = logging.getLogger(__name__) 
    2427 
    2528import dateutil.parser 
     
    4043        PREFIX skos: <http://www.w3.org/2004/02/skos/core#> 
    4144''' 
    42 # TODO: remove these methods 
    43 def category_search(category): 
    44     c = CategorySearch() 
    45     c.execute(category) 
    46     return c.results 
    47  
    48 def value_info(v): 
    49     return u'%s' % v.value 
    5045 
    5146class CategorySearch: 
     
    7166        values = results.getValues(u'subject') 
    7267        self.results = values 
     68 
     69 
     70class SPOQuery(object): 
     71    def __init__(self, verbose=False): 
     72        self.results = [] 
     73        self.verbose = verbose 
     74 
     75    def execute(self, subject=u'?subject', predicate=u'?predicate', object=u'?object'): 
     76        if subject.startswith('?') and object.startswith('?'): 
     77            raise Exception('Cannot have both subject and object not defined') 
     78        def correct(x): 
     79            if x.startswith(u'http://'): 
     80                return u'<%s>' % x 
     81            else: 
     82                return x 
     83        subject, predicate, object = [ correct(x) for x in [subject, predicate, 
     84            object]] 
     85        query = PREFIXES + ''' 
     86            SELECT * WHERE { 
     87                %s %s %s 
     88            } 
     89        ''' % (subject, predicate, object) 
     90        self.query = query 
     91        logger.debug(self.query) 
     92        if self.verbose: 
     93            print self.query 
     94        sparql = SPARQLWrapper2('http://dbpedia.org/sparql') 
     95        sparql.setQuery(query) 
     96        sparql.setReturnFormat(JSON) 
     97        results = sparql.query() 
     98        # TODO: support predicate 
     99        if subject.startswith('?'): 
     100            values = results.getValues(subject[1:]) 
     101        else: 
     102            values = results.getValues(object[1:]) 
     103        self.results = [ v.value for v in values ] 
     104        
    73105 
    74106import pprint 
     
    252284 
    253285 
    254 def describe(uri, verbose=False): 
    255     d = Describe(verbose) 
    256     d.execute(uri) 
    257     return d.to_str() 
    258  
    259  
    260286# def describe2(): 
    261287#     '''This is what the snorql interface produced when doing describe. 
     
    285311 
    286312 
    287 def demo(): 
    288     cat = 'Category:Battles_and_operations_of_World_War_II' 
    289     uri1 = 'http://dbpedia.org/resource/Admiralty_Islands_campaign' 
    290     uri2 = 'http://dbpedia.org/resource/Battle_of_Normandy' 
    291     # print describe(uri1) 
    292     print describe(uri2) 
    293     print category_search(cat) 
    294  
    295  
    296 if __name__ == '__main__': 
    297     import sys 
    298     cmd = sys.argv[1] 
    299     if cmd == 'describe': 
    300         uri = sys.argv[2] 
    301         print describe(uri, verbose=True) 
    302     elif cmd == 'search': 
    303         uri = sys.argv[2] 
    304         print category_search(uri) 
    305     elif cmd == 'demo': 
    306         demo() 
  • trunk/microfacts/lib/cli.py

    r207 r208  
    106106        data.close() 
    107107 
     108import pprint 
    108109class Dbpedia(MicrofactsCommand): 
    109110    '''CLI interface to DBPedia. 
     
    111112    describe <uri> 
    112113    search <category-name> 
     114    # queries 
     115    qsubject <predicate> <object> 
     116    qobject <subject> <predicate> 
    113117     
    114118Examples: 
     
    117121    describe http://en.wikipedia.org/wiki/Napoleon_I_of_France 
    118122    search Category:Battles_and_operations_of_World_War_II 
     123    qsubject http://dbpedia.org/property/commander http://dbpedia.org/resource/Napoleon_I_of_France 
     124    # using prefixes 
     125    subject dbpedia2:commander Napoleon_I_of_France 
    119126    ''' 
    120127    summary = __doc__.split('\n')[0] 
     
    124131    min_args = 2 
    125132    default_verbosity = 0 
     133 
     134    def _l(self, tlist): 
     135        # return pprint.pformat(c.results) 
     136        out = u'' 
     137        for item in tlist: 
     138            out += str(item) + '\n' 
     139        return out 
    126140 
    127141    def command(self): 
     
    142156        elif cmd == 'search': 
    143157            category = self.args[1] 
    144             result = dbp.category_search(category) 
     158            c = CategorySearch() 
     159            c.execute(category) 
     160            result = self._l(c.results) 
     161        elif cmd == 'qsubject': 
     162            predicate = self.args[1] 
     163            object = self.args[2] 
     164            q = dbp.SPOQuery(verbose=self.verbose) 
     165            q.execute(predicate=predicate, object=object) 
     166            result = self._l(q.results) 
     167        elif cmd == 'qobject': 
     168            subject = self.args[1] 
     169            predicate = self.args[2] 
     170            q = dbp.SPOQuery() 
     171            q.execute(predicate=predicate, subject=subject) 
     172            result = self._l(q.results) 
    145173        else: 
    146174            msg = 'Command %s not recognized' % cmd 
  • trunk/microfacts/lib/json.py

    r201 r208  
    6969            entity = self.domain_object.query.get(id) 
    7070            if entity is None: 
    71                 msg = 'No Entity of type %s exists with id %s' % (self.domain_object.__name__, id) 
    72                 # TODO: be more specific -- have a NotFound exception or return 
    73                 # None? 
    74                 raise LoadFromJsonException(msg
     71                # 2008-09-16 no reason not to allow id on new objects 
     72                # msg = 'No Entity of type %s exists with id %s' % (self.domain_object.__name__, id) 
     73                # raise LoadFromJsonException(msg) 
     74                entity = self.domain_object(id=id
    7575        else: 
    7676            entity = self.domain_object() 
     
    136136            val = v 
    137137            if k == 'start' or k == 'end': # a datetime 
    138                 if v is not None: 
     138                if v: # could be '' or None 
    139139                    # default to 1st of January 1AD 
    140140                    default = datetime.datetime(1,1,1) 
  • trunk/microfacts/tests/getdata/test_dbpedia.py

    r207 r208  
    44    from microfacts.getdata.dbpedia import * 
    55except: 
     6    raise 
    67    dotest = False 
    78 
     
    7071        assert long == 16.8, long 
    7172        assert self.fct.start == self.start 
     73        assert self.fct.end == None, self.fct.end 
    7274 
    7375 
     
    8890        assert self.kwds['source'] == self.uri 
    8991 
     92 
     93class TestCategorySearch(DbpediaBase): 
     94    def test_1(self): 
     95        cs = CategorySearch() 
     96        cat = 'Category:Battles_and_operations_of_World_War_II' 
     97        cs.execute(cat) 
     98        print cs.results 
     99        assert len(cs.results) > 0 
     100 
     101class TestSPOQuery(DbpediaBase): 
     102    def test_subject_query(self): 
     103        subject = u'http://dbpedia.org/resource/Battle_of_Wagram' 
     104        pred = u'http://dbpedia.org/property/commander' 
     105        object = u'http://dbpedia.org/resource/Napoleon_I_of_France' 
     106        q = SPOQuery() 
     107        q.execute(predicate=pred, object=object) 
     108        print q.query 
     109        print q.results 
     110        assert len(q.results) == 48, len(q.results) 
     111        assert subject in q.results 
     112 
     113    def test_object_query(self): 
     114        subject = u'http://dbpedia.org/resource/Battle_of_Waterloo' 
     115        pred = u'http://dbpedia.org/property/commander' 
     116        object = u'http://dbpedia.org/resource/Napoleon_I_of_France' 
     117        q = SPOQuery() 
     118        q.execute(predicate=pred, subject=subject) 
     119        # this works too 
     120        # q.execute(predicate=pred, subject=subject, object='?obj') 
     121        print q.query 
     122        print q.results 
     123        # should be 4 not 5 but some weird dbpedia stuff (flagicon!) 
     124        assert len(q.results) == 5, len(q.results) 
     125        assert object in q.results 
     126 
  • trunk/microfacts/tests/test_json.py

    r201 r208  
    8585        assert out.title == self.title 
    8686 
    87     def test_load_factlet_with_bad_id(self): 
     87    def test_load_factlet_with_empty_string_date(self): 
     88        mydict = { 'end' : '' } 
     89        out = self.converter.to_domain_object(mydict) 
     90        assert out.end == None 
     91 
     92    # TODO: sort out what happens when id is supplied but not object exists 
     93    def _test_load_factlet_with_bad_id(self): 
    8894        # non existent id 
    8995        mydict = { 'id' : '1344387134' } 
     
    137143        assert len(thread.factlets) == 6 
    138144 
    139     def test_load_thread_with_id(self): 
     145    # TODO: sort out what happens when id is supplied but not object exists 
     146    def _test_load_thread_with_id(self): 
    140147        # a non-existent id 
    141148        mydict = { 'id' : '24343143' } 
  • trunk/microfacts/tests/test_modes.py

    r201 r208  
    247247 
    248248 
    249 class TestEntityPutNotFound(PresentationModeCase): 
     249# TODO: sort out what happends when id does not exist ... 
     250class _TestEntityPutNotFound(PresentationModeCase): 
    250251 
    251252    mode_class = EntityPut