Changeset 187

Show
Ignore:
Timestamp:
08/16/08 22:01:22 (4 months ago)
Author:
rgrp
Message:

[shakespeare/stats][l]: implement basic statistical analysis of texts. Currently implemented as a core standalone module plus a WUI controller.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/shakespeare/model/dm.py

    Revision 176 Revision 187
    1""" 1""" 
    2Domain model 2Domain model 
    3 3 
    4Material contains all data we have including shakespeare texts. A text is taken 4Material contains all data we have including shakespeare texts. A text is taken 
    5to be a specific version of a work. e.g. the 1623 folio of King Richard III. 5to be a specific version of a work. e.g. the 1623 folio of King Richard III. 
    6 6 
    7We may in future add a Work object to refer to 'abstract' work of which a given 7We may in future add a Work object to refer to 'abstract' work of which a given 
    8text is a version. 8text is a version. 
    9""" 9""" 
    10from pylons import config 10from pylons import config 
    11from sqlalchemy import Column, MetaData, Table, types, ForeignKey 11from sqlalchemy import Column, MetaData, Table, types, ForeignKey 
    12from sqlalchemy import orm 12from sqlalchemy import orm 
    13from sqlalchemy.orm import relation, backref 13from sqlalchemy.orm import relation, backref 
    14 14 
    15# make sure config is registered 15# make sure config is registered 
    16import shakespeare 16import shakespeare 
    17shakespeare.conf() 17shakespeare.conf() 
    18 18 
    19metadata = MetaData() 19metadata = MetaData() 
    20Session = orm.scoped_session(orm.sessionmaker( 20Session = orm.scoped_session(orm.sessionmaker( 
    21    autoflush=True, 21    autoflush=True, 
    22    transactional=False, 22    transactional=False, 
    23    bind=config['pylons.g'].sa_engine 23    bind=config['pylons.g'].sa_engine 
    24)) 24)) 
    25 25 
    26import shakespeare 26import shakespeare 
    27import shakespeare.cache 27import shakespeare.cache 
    28 28 
    29# import other sqlobject items 29# import other sqlobject items 
    30from annotater.model import Annotation 30from annotater.model import Annotation 
    31import annotater.model 31import annotater.model 
    32 32 
    33material_table = Table('material', metadata, 33material_table = Table('material', metadata, 
    34    Column('id', types.Integer, primary_key=True), 34    Column('id', types.Integer, primary_key=True), 
    35    Column('name', types.String(255)), 35    Column('name', types.String(255)), 
    36    Column('title', types.String(255)), 36    Column('title', types.String(255)), 
    37    Column('creator', types.String(255)), 37    Column('creator', types.String(255)), 
    38    Column('url', types.String(255)), 38    Column('url', types.String(255)), 
    39    Column('notes', types.Text()) 39    Column('notes', types.Text()) 
    40    ) 40    ) 
    41 41 
    42# TODO: indices on word and occurrences 42# TODO: indices on word and occurrences 
    43statistic_table = Table('statistic', metadata, 43statistic_table = Table('statistic', metadata, 
    44    Column('id', types.Integer, primary_key=True), 44    Column('id', types.Integer, primary_key=True), 
    45    Column('material_id', types.Integer, ForeignKey('material.id')), 45    Column('material_id', types.Integer, ForeignKey('material.id')), 
    46    Column('word', types.String(50)), 46    Column('word', types.String(50)), 
    47    Column('occurrences', types.Integer, default=1), 47    Column('freq', types.Integer), 
    48    ) 48    ) 
    49 49 
    50 50 
    51from ConfigParser import SafeConfigParser 51from ConfigParser import SafeConfigParser 
    52 52 
    53 53 
    54 54 
    55class Material(object): 55class Material(object): 
    56    """Material related to Shakespeare (usually text of works and ancillary 56    """Material related to Shakespeare (usually text of works and ancillary 
    57    matter such as introductions). 57    matter such as introductions). 
    58 58 
    59    NB: can not use 'text' as class name as it is an sql reserved word 59    NB: can not use 'text' as class name as it is an sql reserved word 
    60 60 
    61    @attribute name: a unique name identifying the material 61    @attribute name: a unique name identifying the material 
    62     62     
    63    TODO: multiple creators ?? 63    TODO: multiple creators ?? 
    64    """ 64    """ 
    65 65 
    66    # TODO: remove (just here for sqlobject bkwards compat) 66    # TODO: remove (just here for sqlobject bkwards compat) 
    67    @classmethod 67    @classmethod 
    68    def byName(self, name): 68    def byName(self, name): 
    69        return self.query.filter_by(name=name).one() 69        return self.query.filter_by(name=name).first() 
    70     70     
    71    def get_text(self, format=None): 71    def get_text(self, format=None): 
    72        '''Get text (if any) associated with this material. 72        '''Get text (if any) associated with this material. 
    73 73 
    74        # ignore format for time being 74        # ignore format for time being 
    75        ''' 75        ''' 
    76        import pkg_resources 76        import pkg_resources 
    77        pkg = 'shksprdata' 77        pkg = 'shksprdata' 
    78        # default to plain txt format (TODO: generalise this) 78        # default to plain txt format (TODO: generalise this) 
    79        path = 'texts/%s.txt' % self.name 79        path = 'texts/%s.txt' % self.name 
    80        fileobj = pkg_resources.resource_stream(pkg, path) 80        fileobj = pkg_resources.resource_stream(pkg, path) 
    81        return fileobj 81        return fileobj 
    82 82 
    83    def get_cache_path(self, format): 83    def get_cache_path(self, format): 
    84        """Get path within cache to data file associated with this material. 84        """Get path within cache to data file associated with this material. 
    85        @format: the version ('plain', original='' etc) 85        @format: the version ('plain', original='' etc) 
    86        """ 86        """ 
    87        return shakespeare.cache.default.path(self.url, format) 87        return shakespeare.cache.default.path(self.url, format) 
    88 88 
    89    @classmethod 89    @classmethod 
    90    def load_from_metadata(self, fileobj): 90    def load_from_metadata(self, fileobj): 
    91        cfgp = SafeConfigParser() 91        cfgp = SafeConfigParser() 
    92        cfgp.readfp(fileobj) 92        cfgp.readfp(fileobj) 
    93        for section in cfgp.sections(): 93        for section in cfgp.sections(): 
    94            try: 94            item = Material.byName(section) 
    95                item = Material.byName(section) 95            if item is None: 
    96            except:   
    97                item = Material(name=section) 96                item = Material(name=section) 
      97            assert item is not None 
    98            for key, val in cfgp.items(section): 98            for key, val in cfgp.items(section): 
    99                setattr(item, key, val) 99                setattr(item, key, val) 
      100            Session.flush() 
    100 101 
    101class Statistic(object): 102class Statistic(object): 
    102    pass 103    pass 
    103 104 
    104# Map each domain model class to its corresponding relational table. 105# Map each domain model class to its corresponding relational table. 
    105mapper = Session.mapper 106mapper = Session.mapper 
    106mapper(Material, material_table) 107mapper(Material, material_table) 
    107mapper(Statistic, statistic_table, properties={ 108mapper(Statistic, statistic_table, properties={ 
    108    'text':relation(Material, backref='statistics') 109    'text':relation(Material, backref='statistics') 
    109    }) 110    }) 
    110 111 
  • trunk/shakespeare/tests/__init__.py

    Revision 150 Revision 187
    1"""Pylons application test package 1"""Pylons application test package 
    2 2 
    3When the test runner finds and executes tests within this directory, 3When the test runner finds and executes tests within this directory, 
    4this file will be loaded to setup the test environment. 4this file will be loaded to setup the test environment. 
    5 5 
    6It registers the root directory of the project in sys.path and 6It registers the root directory of the project in sys.path and 
    7pkg_resources, in case the project hasn't been installed with 7pkg_resources, in case the project hasn't been installed with 
    8setuptools. It also initializes the application via websetup (paster 8setuptools. It also initializes the application via websetup (paster 
    9setup-app) with the project's test.ini configuration file. 9setup-app) with the project's test.ini configuration file. 
    10""" 10""" 
    11import os 11import os 
    12import sys 12import sys 
    13 13 
    14import pkg_resources 14import pkg_resources 
    15import paste.fixture 15import paste.fixture 
    16import paste.script.appinstall 16import paste.script.appinstall 
    17from paste.deploy import loadapp 17from paste.deploy import loadapp 
    18from routes import url_for 18from routes import url_for 
    19 19 
    20__all__ = ['url_for', 'TestController'] 20__all__ = ['url_for', 'TestController', 'make_fixture'] 
    21 21 
    22here_dir = os.path.dirname(os.path.abspath(__file__)) 22here_dir = os.path.dirname(os.path.abspath(__file__)) 
    23conf_dir = os.path.dirname(os.path.dirname(here_dir)) 23conf_dir = os.path.dirname(os.path.dirname(here_dir)) 
    24 24 
    25sys.path.insert(0, conf_dir) 25sys.path.insert(0, conf_dir) 
    26pkg_resources.working_set.add_entry(conf_dir) 26pkg_resources.working_set.add_entry(conf_dir) 
    27pkg_resources.require('Paste') 27pkg_resources.require('Paste') 
    28pkg_resources.require('PasteScript') 28pkg_resources.require('PasteScript') 
    29 29 
    30test_file = os.path.join(conf_dir, 'test.ini') 30test_file = os.path.join(conf_dir, 'test.ini') 
    31cmd = paste.script.appinstall.SetupCommand('setup-app') 31cmd = paste.script.appinstall.SetupCommand('setup-app') 
    32cmd.run([test_file]) 32cmd.run([test_file]) 
    33 33 
      34sonnet18_text = \ 
      35'''Shall I compare thee to a summer's day? 
      36Thou art more lovely and more temperate: 
      37Rough winds do shake the darling buds of May, 
      38And summer's lease hath all too short a date: 
      39 
      40Sometime too hot the eye of heaven shines, 
      41And often is his gold complexion dimm'd, 
      42And every fair from fair sometime declines, 
      43By chance, or nature's changing course untrimm'd:  
      44 
      45But thy eternal summer shall not fade, 
      46Nor lose possession of that fair thou ow'st, 
      47Nor shall death brag thou wander'st in his shade, 
      48When in eternal lines to time thou grow'st, 
      49 
      50  So long as men can breathe, or eyes can see, 
      51  So long lives this, and this gives life to thee. 
      52''' 
      53 
      54# must use make_fixture rather than just create object as we need to be in 
      55# current db session 
      56def make_fixture(): 
      57    import shakespeare.model as model 
      58    sonnet18_name = 'test_sonnet18' 
      59    sonnet18 = model.Material.byName(sonnet18_name) 
      60    if not sonnet18: 
      61        sonnet18 = model.Material(name=sonnet18_name, 
      62                title='Sonnet 18', 
      63                ) 
      64        model.Session.flush() 
      65    sonnet18.content = sonnet18_text 
      66    return sonnet18 
      67 
      68 
    34class TestController(object): 69class TestController(object): 
    35 70 
    36    def __init__(self, *args, **kwargs): 71    def __init__(self, *args, **kwargs): 
    37        wsgiapp = loadapp('config:test.ini', relative_to=conf_dir) 72        wsgiapp = loadapp('config:test.ini', relative_to=conf_dir) 
    38        self.app = paste.fixture.TestApp(wsgiapp) 73        self.app = paste.fixture.TestApp(wsgiapp) 
  • trunk/shakespeare/tests/test_model.py

    Revision 176 Revision 187
    1import shakespeare.model as model 1import shakespeare.model as model 
    2 2 
    3class TestMaterial(object): 3class TestMaterial(object): 
    4 4 
    5    @classmethod 5    @classmethod 
    6    def setup_class(self): 6    def setup_class(self): 
    7        self.name = 'test-123' 7        self.name = 'test-123' 
    8        self.title = 'Hamlet' 8        self.title = 'Hamlet' 
    9        self.url = 'http://www.openshakespeare.org/blah.txt' 9        self.url = 'http://www.openshakespeare.org/blah.txt' 
    10        self.text = model.Material(name=self.name, 10        self.text = model.Material(name=self.name, 
    11                title=self.title, url=self.url) 11                title=self.title, url=self.url) 
    12        model.Session.flush() 12        model.Session.flush() 
    13        self.textid = self.text.id 13        self.textid = self.text.id 
    14        model.Session.clear() 14        model.Session.clear() 
    15 15 
    16    @classmethod 16    @classmethod 
    17    def teardown_class(self): 17    def teardown_class(self): 
    18        text = model.Material.query.get(self.textid) 18        text = model.Material.query.get(self.textid) 
    19        model.Session.delete(text) 19        model.Session.delete(text) 
    20        model.Session.flush() 20        model.Session.flush() 
    21     21     
    22    def test1(self): 22    def test1(self): 
    23        txt2 = model.Material.query.get(self.textid) 23        txt2 = model.Material.query.get(self.textid) 
    24        txt3 = model.Material.byName(self.name) 24        txt3 = model.Material.byName(self.name) 
    25        assert self.text.id == txt2.id 25        assert self.text.id == txt2.id 
    26        assert self.text.id == txt3.id 26        assert self.text.id == txt3.id 
    27     27     
    28    def test_get_cache_path(self): 28    def test_get_cache_path(self): 
    29        out = self.text.get_cache_path('plain') 29        out = self.text.get_cache_path('plain') 
    30        # do not want anything too specific or we end up duplicating cache_test 30        # do not want anything too specific or we end up duplicating cache_test 
    31        assert len(out) > 0 31        assert len(out) > 0 
    32 32 
    33    # TODO: set up fixtures before running this ... 33    # TODO: set up fixtures before running this ... 
    34    def _test_get_text(self): 34    def _test_get_text(self): 
    35        text = model.Material.byName('phoenix_and_the_turtle_gut') 35        text = model.Material.byName('phoenix_and_the_turtle_gut') 
    36        out = text.get_text() 36        out = text.get_text() 
    37        out = out.read() 37        out = out.read() 
    38        assert len(out) > 0 38        assert len(out) > 0 
    39        assert out[:26] == 'THE PHOENIX AND THE TURTLE' 39        assert out[:26] == 'THE PHOENIX AND THE TURTLE' 
    40 40 
    41 41 
    42class TestStatistic: 42class TestStatistic: 
    43 43 
    44    @classmethod 44    @classmethod 
    45    def setup_class(self): 45    def setup_class(self): 
    46        self.name = 'test-123' 46        self.name = 'test-123' 
    47        self.title = 'Hamlet' 47        self.title = 'Hamlet' 
    48        self.text = model.Material(name=self.name, title=self.title) 48        self.text = model.Material(name=self.name, title=self.title) 
    49        self.word = 'jones' 49        self.word = 'jones' 
    50        self.occurrences = 5 50        self.freq = 5 
    51        self.cc1 = model.Statistic( 51        self.cc1 = model.Statistic( 
    52                text=self.text, 52                text=self.text, 
    53                word=self.word, 53                word=self.word, 
    54                occurrences=self.occurrences 54                freq=self.freq 
    55                ) 55                ) 
    56        model.Session.flush() 56        model.Session.flush() 
    57        self.statid = self.cc1.id 57        self.statid = self.cc1.id 
    58        model.Session.clear() 58        model.Session.clear() 
    59 59 
    60    @classmethod 60    @classmethod 
    61    def teardown_class(self): 61    def teardown_class(self): 
    62        stat = model.Statistic.query.get(self.statid) 62        stat = model.Statistic.query.get(self.statid) 
    63        model.Session.delete(stat) 63        model.Session.delete(stat) 
    64        model.Session.delete(stat.text) 64        model.Session.delete(stat.text) 
    65        model.Session.flush() 65        model.Session.flush() 
    66        model.Session.remove() 66        model.Session.remove() 
    67 67 
    68    def test1(self): 68    def test1(self): 
    69        out1 = model.Statistic.query.get(self.statid) 69        out1 = model.Statistic.query.get(self.statid) 
    70        assert out1.text.name == self.name 70        assert out1.text.name == self.name 
    71        assert out1.occurrences == self.occurrences 71        assert out1.freq == self.freq 
    72 72 
    73    def test_select(self): 73    def test_select(self): 
    74        tresults = model.Statistic.query.filter_by(text=self.text 74        tresults = model.Statistic.query.filter_by(text=self.text 
    75                ).filter_by(word=self.word) 75                ).filter_by(word=self.word) 
    76        num = tresults.count() 76        num = tresults.count() 
    77        assert num == 1 77        assert num == 1 
    78 78