Changeset 2

Show
Ignore:
Timestamp:
05/09/06 20:01:41 (3 years ago)
Author:
rgrp
Message:

Another major change involving adding a concordance webpage (+ associated
changes) and extensive refactoring of the index.

Index
=====

Concordance
===========

  • Create webpage at /concordance
    • cherrypy_handler.py
    • template/concordance.html
  • concordancer.py: make_concordancer
    • Refactor the concordancer code so that the caller can specify exactly which works to add to the concordance and the file in which to save it
  • concordancer.py: Concordancer: add a words_to_ignore class variable listing
    the words to ignore
Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/bin/shakespeare-admin

    Revision 1 Revision 2
    1#!/usr/bin/env python 1#!/usr/bin/env python 
    2 2 
    3import cmd 3import cmd 
    4import StringIO 4import StringIO 
    5 5 
    6from shakespeare.format import GutenbergShakespeare 6from shakespeare.format import GutenbergShakespeare 
    7from shakespeare.download import download_all_shakespeare   
    8from shakespeare.download import make_index   
    9from shakespeare.concordancer import make_concordancer 7from shakespeare.concordancer import make_concordancer 
    10import shakespeare.utils as utils 8import shakespeare.utils as utils 
      9import shakespeare.work 
    11 10 
    12class ShakespeareAdmin(cmd.Cmd): 11class ShakespeareAdmin(cmd.Cmd): 
    13 12 
    14    def __init__(self): 13    def __init__(self): 
    15        cmd.Cmd.__init__(self) # cmd.Cmd is not a new style class 14        cmd.Cmd.__init__(self) # cmd.Cmd is not a new style class 
    16        self._index = make_index() 15        self._index = shakespeare.work.index.all 
    17 16 
    18    def do_format(self, line=None): 17    def do_format(self, line=None): 
    19        path = sys.argv[1] 18        path = sys.argv[1] 
    20        x = GutenbergShakespeare(file(path)) 19        x = GutenbergShakespeare(file(path)) 
    21        print x.extract_text() 20        print x.extract_text() 
    22     21     
    23    def help_format(self, line=None): 22    def help_format(self, line=None): 
    24        usage = \ 23        usage = \ 
    25'''Format a raw gutenberg text. 24'''Format a raw gutenberg text. 
    26 25 
    27Take a raw gutenberg text in and return (on stdout) the core text (i.e. strip 26Take a raw gutenberg text in and return (on stdout) the core text (i.e. strip 
    28out all the gutenberg bumpf)''' 27out all the gutenberg bumpf)''' 
    29        print usage 28        print usage 
    30 29 
    31    def do_format_all(self, line): 30    def do_format_all(self, line): 
    32        index = self._index  31        index = self._index  
    33        for item in index: 32        for item in index: 
    34            url = item[1] 33            url = item[1] 
    35            src = utils.get_local_path(url) 34            src = utils.get_local_path(url) 
    36            dest = utils.get_local_path(url, 'cleaned') 35            dest = utils.get_local_path(url, 'cleaned') 
    37            infile = file(src) 36            infile = file(src) 
    38            if src.endswith('wssnt10.txt'): # if it is the sonnets need a hack 37            if src.endswith('wssnt10.txt'): # if it is the sonnets need a hack 
    39                # delete last 140 characters 38                # delete last 140 characters 
    40                tmp1 = infile.read() 39                tmp1 = infile.read() 
    41                infile = StringIO.StringIO(tmp1[:-120]) 40                infile = StringIO.StringIO(tmp1[:-120]) 
    42            formatter = GutenbergShakespeare(infile) 41            formatter = GutenbergShakespeare(infile) 
    43            ff = file(dest, 'w') 42            ff = file(dest, 'w') 
    44            out = formatter.extract_text() 43            out = formatter.extract_text() 
    45            ff.write(out) 44            ff.write(out) 
    46            ff.close() 45            ff.close() 
    47     46     
    48    def do_download_texts(self, line): 47    def do_download_texts(self, line): 
    49        download_all_shakespeare()  48         for item in self._index: 
       49             utils.download_url(item[1]) 
    50 50 
    51    def help_download_texts(self, line=None): 51    def help_download_texts(self, line=None): 
    52        print download_all_shakespeare.__doc__  52         usage = \ 
       53 """ 
       54 Download from Project Gutenberg all the shakespeare texts listed in the index.""" 
       55         print usage  
    53     56     
    54    def do_print_index(self, line): 57    def do_print_index(self, line): 
    55        for row in self._index: 58        for row in self._index: 
    56            print row 59            print row 
    57 60 
    58    def help_print_index(self, line=None): 61    def help_print_index(self, line=None): 
    59        usage = \ 62        usage = \ 
    60'''Print index of Shakespeare texts to stdout''' 63'''Print index of Shakespeare texts to stdout''' 
    61        print usage 64        print usage 
    62 65 
    63    def do_make_concordancer(self, line=None): 66    def do_make_concordancer(self, line=None): 
    64        make_concordancer()  67         if line is not None: 
       68             textsToAdd = [] 
       69             textsUrls = line.split() 
       70             for item in self._index: 
       71               if item[1] in textsUrls: 
       72                   textsToAdd.append(item) 
       73             make_concordancer(textsToAdd) 
       74         else: 
       75             make_concordancer() 
    65 76 
    66    def help_make_concordancer(self, line=None): 77    def help_make_concordancer(self, line=None): 
    67        print make_concordance.__doc__ 78        print make_concordance.__doc__ 
    68 79 
    69    def do_help(self, line=None): 80    def do_help(self, line=None): 
    70        cmd.Cmd.do_help(self, line) 81        cmd.Cmd.do_help(self, line) 
    71 82 
    72    def do_quit(self): 83    def do_quit(self): 
    73        sys.exit() 84        sys.exit() 
    74 85 
    75    def do_EOF(self, *args): 86    def do_EOF(self, *args): 
    76        print '' 87        print '' 
    77        sys.exit() 88        sys.exit() 
    78 89 
    79if __name__ == '__main__': 90if __name__ == '__main__': 
    80    import sys 91    import sys 
    81    usage = """ 92    usage = """ 
    82%prog cmd 93%prog cmd 
    83 94 
    84    format-all: format all gutenberg etexts automatically 95    format-all: format all gutenberg etexts automatically 
    85    download: download the gutenberg etexts and store them in the cache 96    download: download the gutenberg etexts and store them in the cache 
    86    """ 97    """ 
    87    adminCmd = ShakespeareAdmin() 98    adminCmd = ShakespeareAdmin() 
    88    if len(sys.argv) < 2: 99    if len(sys.argv) < 2: 
    89        while 1: 100        while 1: 
    90            try: 101            try: 
    91                adminCmd.cmdloop() 102                adminCmd.cmdloop() 
    92                break 103                break 
    93            except KeyboardInterrupt: 104            except KeyboardInterrupt: 
    94                raise 105                raise 
    95    else: 106    else: 
    96        args = ' '.join(sys.argv[1:]) 107        args = ' '.join(sys.argv[1:]) 
    97        args = args.replace('-','_') 108        args = args.replace('-','_') 
    98        adminCmd.onecmd(args) 109        adminCmd.onecmd(args) 
  • trunk/src/shakespeare/cherrypy_handler.py

    Revision 1 Revision 2
    1""" 1""" 
    2Tutorial - Passing variables 2Web interface to view and analyze shakespeare texts. 
    3   
    4This tutorial shows you how to pass GET/POST variables to methods.   
    5""" 3""" 
    6import cherrypy 4import cherrypy 
    7import os 5import os 
    8 6 
    9from shakespeare.download import make_index 7import shakespeare.work 
    10index = make_index()  8index = shakespeare.work.index.all  
    11from shakespeare.utils import get_local_path 9from shakespeare.utils import get_local_path 
    12import shakespeare.format 10import shakespeare.format 
    13 11 
    14import shakespeare.concordancer 12import shakespeare.concordancer 
    15concordancer = shakespeare.concordancer.get_concordancer() 13cc = shakespeare.concordancer.get_concordancer() 
    16 14 
    17class WelcomePage: 15class WelcomePage: 
    18 16 
    19    def index(self): 17    def index(self): 
    20        try: 18        try: 
    21            import kid 19            import kid 
    22            kid.enable_import(suffixes=[".html"]) 20            kid.enable_import(suffixes=[".html"]) 
    23            import shakespeare.template.index 21            import shakespeare.template.index 
    24            template = shakespeare.template.index.Template(works_index=index) 22            template = shakespeare.template.index.Template(works_index=index) 
    25            result = str(template) 23            result = str(template) 
    26            # result = 'test' 24            # result = 'test' 
    27            return result 25            return result 
    28        except Exception, inst: 26        except Exception, inst: 
    29            return '<p><strong>There was an error: ' +  str(inst) + '</strong></p>' 27            return '<p><strong>There was an error: ' +  str(inst) + '</strong></p>' 
    30    index.exposed = True 28    index.exposed = True 
    31 29 
    32    def view(self, text_url=None, version='cleaned', format='plain'): 30    def view(self, text_url=None, version='cleaned', format='plain'): 
    33        localPath = get_local_path(text_url, version) 31        localPath = get_local_path(text_url, version) 
    34        ff = file(localPath) 32        ff = file(localPath) 
    35        if format == 'plain': 33        if format == 'plain': 
    36            result = '<pre>' + ff.read() + '</pre>' 34            result = '<pre>' + ff.read() + '</pre>' 
    37        else: 35        else: 
    38            formatter = shakespeare.format.TextFormatter(ff) 36            formatter = shakespeare.format.TextFormatter(ff) 
    39            result = formatter.format(format) 37            result = formatter.format(format) 
    40            # import kid 38            # import kid 
    41            # kid.enable_import(suffixes=['.html']) 39            # kid.enable_import(suffixes=['.html']) 
    42            # module = __import__('shakespeare.template.format_' + format, '', '', '*') 40            # module = __import__('shakespeare.template.format_' + format, '', '', '*') 
    43            # template = module.Template(fileobj=ff) 41            # template = module.Template(fileobj=ff) 
    44            # result = template.serialize() 42            # result = template.serialize() 
    45        ff.close() 43        ff.close() 
    46        return result 44        return result 
    47    view.exposed = True 45    view.exposed = True 
    48 46 
    49#    def concordance(self):  47     def concordance(self): 
    50#        import kid  48         import kid 
    51#        kid.enable_import(suffixes=[".html"])  49         kid.enable_import(suffixes=[".html"]) 
    52#        import shakespeare.template.concordance  50         import shakespeare.template.concordance 
    53#        template = shakespeare.template.concordance.Template(concordancer=concordancer)  51         concordance = cc.concordance 
    54#        result = template.serialize()  52         words = concordance.keys() 
    55#        return result  53         words.sort() 
    56#    concordance.exposed = True  54         template = shakespeare.template.concordance.Template(words=words, stats=cc.stats) 
       55         result = template.serialize() 
       56         # result = str(cc) 
       57         return result 
       58     concordance.exposed = True 
    57   59   
    58 60 
    59cherrypy.root = WelcomePage() 61cherrypy.root = WelcomePage() 
    60 62 
    61if __name__ == '__main__': 63if __name__ == '__main__': 
    62    cherrypy.lowercase_api = True 64    cherrypy.lowercase_api = True 
    63    # cherrypy.config.update(file = 'tutorial.conf') 65    # cherrypy.config.update(file = 'tutorial.conf') 
    64    cherrypy.config.update({'server.showTracebacks' : True }) 66    cherrypy.config.update({'server.showTracebacks' : True }) 
    65    cherrypy.server.start() 67    cherrypy.server.start() 
    66     68     
    67"""     69"""     
    68[global] 70[global] 
    69server.socketPort = 8080 71server.socketPort = 8080 
    70server.threadPool = 10 72server.threadPool = 10 
    71server.environment = "production" 73server.environment = "production" 
    72# server.showTracebacks = True 74# server.showTracebacks = True 
    73# server.logToScreen = False 75# server.logToScreen = False 
    74""" 76""" 
  • trunk/src/shakespeare/concordancer.py

    Revision 1 Revision 2
    1import re 1import re 
    2import cPickle 2import cPickle 
    3 3 
    4import utils 4import utils 
    5from download import make_index 5import shakespeare.work 
    6 6 
    7def make_concordancer(showProgress=True):  7 def make_concordancer( 
       8         texts_to_add=shakespeare.work.index.all, 
       9         out_path=utils.get_local_path('concordance.p'), 
       10         ): 
    8    """Create Concordancer object and use it to produce concordance and stats 11    """Create Concordancer object and use it to produce concordance and stats 
    9    for all non-folio works. 12    for all non-folio works. 
    10    Save resulting object in pickled form to 'concordance.p'.  13     @out_path: where to save the concordance 
       14     @texts_to_add: index items that should be added to the concordance 
    11    """ 15    """ 
    12    def _print(msg):   
    13        if showProgress:   
    14            print(msg)   
    15    index = make_index()   
    16    cc = Concordancer() 16    cc = Concordancer() 
    17    for item in index17    for item in texts_to_add
    18        url = item[1] 18        url = item[1] 
    19        isfolio = item[2] == 'folio' 19        isfolio = item[2] == 'folio' 
    20        src = utils.get_local_path(url, 'cleaned') 20        src = utils.get_local_path(url, 'cleaned') 
    21        if isfolio: 21        cc.add_text(file(src)) 
    22            _print('Is folio so skipping [%s]' % src) 22    ccFile = file(out_path, 'w') 
    23        else:   
    24            _print('Adding text [%s]' % src)   
    25            cc.add_text(file(src))   
    26    filePath = utils.get_local_path('concordance.p')   
    27    ccFile = file(filePath, 'w')   
    28    cPickle.dump(cc, ccFile) 23    cPickle.dump(cc, ccFile) 
    29 24 
    30def get_concordancer(): 25def get_concordancer(): 
    31    """Get a concordancer containing concordance and stats by unpickling cached 26    """Get a concordancer containing concordance and stats by unpickling cached 
    32    copy. 27    copy. 
    33    """ 28    """ 
    34    filePath = utils.get_local_path('concordance.p') 29    filePath = utils.get_local_path('concordance.p') 
    35    cc = cPickle.load(file(filePath)) 30    cc = cPickle.load(file(filePath)) 
    36    return cc 31    return cc 
    37 32 
    38class Concordancer(object): 33class Concordancer(object): 
    39    """Generate a concordance and associated statistics for a set of texts. 34    """Generate a concordance and associated statistics for a set of texts. 
    40     35     
    41    Concordance and statistics are provided as dictionaries keyed by words. 36    Concordance and statistics are provided as dictionaries keyed by words. 
    42    NB: all word keys have been lower-cased in order to render them case-insensitive 37    NB: all word keys have been lower-cased in order to render them case-insensitive 
    43    """ 38    """ 
    44 39 
    45    # multiline, unicode and ignorecase 40    # multiline, unicode and ignorecase 
    46    wordRegex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I)  41     word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 
       42  
       43     words_to_ignore = [ 'a', 'the', 'and', 
       44                         'as', 'are', 'be', 
       45                         'but', 'd', 'in' 
       46                         ] 
    47 47 
    48    def __init__(self): 48    def __init__(self): 
    49        self.concordance = {} 49        self.concordance = {} 
    50        self.stats = {} 50        self.stats = {} 
    51 51 
    52    def add_text(self, text, textId=None): 52    def add_text(self, text, textId=None): 
    53        """Add a text to the concordance. 53        """Add a text to the concordance. 
    54        @text: file like object containing text to add 54        @text: file like object containing text to add 
    55        """ 55        """ 
    56        lineCount = 0 56        lineCount = 0 
    57        charIndex = 0 57        charIndex = 0 
    58        for line in text.readlines(): 58        for line in text.readlines(): 
    59            for match in self.wordRegex.finditer(line): 59            for match in self.word_regex.finditer(line): 
    60                word = match.group().lower() # case insensitive 60                word = match.group().lower() # case insensitive 
      61                if word in self.words_to_ignore: 
      62                    continue 
    61                oldValue = self.concordance.get(word, []) 63                oldValue = self.concordance.get(word, []) 
    62                oldStat = self.stats.get(word, 0) 64                oldStat = self.stats.get(word, 0) 
    63                oldValue.append( (lineCount, charIndex + match.start()) )  65                 tloc = (textId, lineCount, charIndex + match.start())  
       66                 oldValue.append(tloc) 
    64                self.concordance[word] = oldValue 67                self.concordance[word] = oldValue 
    65                self.stats[word] = oldStat + 1 68                self.stats[word] = oldStat + 1 
    66            lineCount += 1 69            lineCount += 1 
    67            charIndex += len(line) 70            charIndex += len(line) 
    68 71 
    69   
  • trunk/src/shakespeare/concordancer_test.py

    Revision 1 Revision 2
    1import unittest 1import unittest 
    2import StringIO 2import StringIO 
    3 3 
    4import concordancer 4import concordancer 
    5 5 
    6def test_suite(): 6def test_suite(): 
    7    suites = [ 7    suites = [ 
    8        unittest.makeSuite(ConcordancerTest), 8        unittest.makeSuite(ConcordancerTest), 
    9        ] 9        ] 
    10    return unittest.TestSuite(suites) 10    return unittest.TestSuite(suites) 
    11 11 
    12class ConcordancerTest(unittest.TestCase): 12class ConcordancerTest(unittest.TestCase): 
    13 13 
    14    inText = \ 14    inText = \ 
    15"""A fake fake line 15"""A fake fake line 
    16SUFFOLK. 16SUFFOLK. 
    17As by your high imperial Majesty 17As by your high imperial Majesty 
    18I had in charge at my depart for France, 18I had in charge at my depart for France, 
    19As procurator to your excellence, 19As procurator to your excellence, 
    20""" 20""" 
      21    textId = 1 
    21     22     
    22    # ['work_id', 'line-no', 'character-index'] } 23    # ['work_id', 'line-no', 'character-index'] } 
    23    expConcordance = { 24    expConcordance = { 
    24        'fake' : [ (0, 2), (0, 7) ], 25        'fake' : [ (textId, 0, 2), (textId, 0, 7) ], 
    25        'suffolk' : [ (1, 17), ], 26        'suffolk' : [ (textId, 1, 17), ], 
    26        'high' : [ (2, 37), ], 27        'high' : [ (textId, 2, 37), ], 
    27        } 28        } 
    28 29 
    29    expStats = { 30    expStats = { 
    30        'fake' : 2, 31        'fake' : 2, 
    31        'suffolk' : 1, 32        'suffolk' : 1, 
    32        'high' : 1, 33        'high' : 1, 
    33        } 34        } 
    34 35 
    35    def setUp(self): 36    def setUp(self): 
    36        self.cc = concordancer.Concordancer() 37        self.cc = concordancer.Concordancer() 
    37        self.cc.add_text(StringIO.StringIO(self.inText), 'King Henry VI'38        self.cc.add_text(StringIO.StringIO(self.inText), self.textId
    38 39 
    39    def test__process_line(self): 40    def test__process_line(self): 
    40        line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 41        line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 
    41        exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 42        exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 
    42        out = self.cc.wordRegex.findall(line) 43        out = self.cc.word_regex.findall(line) 
    43        self.assertEqual(exp, out) 44        self.assertEqual(exp, out) 
    44 45 
    45    def test_concordance(self): 46    def test_concordance(self): 
    46        for key, value in self.expConcordance.items(): 47        for key, value in self.expConcordance.items(): 
    47            out = self.cc.concordance[key] 48            out = self.cc.concordance[key] 
    48            self.assertEqual(out, value) 49            self.assertEqual(out, value) 
    49 50 
    50    def test_stats(self): 51    def test_stats(self): 
    51        for key, value in self.expStats.items(): 52        for key, value in self.expStats.items(): 
    52            out = self.cc.stats[key] 53            out = self.cc.stats[key] 
    53            self.assertEqual(out, value) 54            self.assertEqual(out, value) 
      55 
      56    def test_make_concordancer(self): 
      57        import tempfile 
      58        filePath = tempfile.mkstemp()[1] 
      59        import shakespeare.work 
      60        index = shakespeare.work.index.all 
      61        concordancer.make_concordancer(index[2:3], filePath) 
  • trunk/src/shakespeare/format.py

    Revision 1 Revision 2
    1""" 1""" 
    2Clean up Gutenberg texts by removing all the header and footer bumpf 2Clean up Gutenberg texts by removing all the header and footer bumpf 
    3""" 3""" 
    4 4 
    5import re 5import re 
    6import download   
    7 6 
    8headerEndPhrases = ["Project Gutenberg's Etext of", 'This etext was prepared by'] 7headerEndPhrases = ["Project Gutenberg's Etext of", 'This etext was prepared by'] 
    9notesStartPhrases = ["Executive Director's Notes:"] 8notesStartPhrases = ["Executive Director's Notes:"] 
    10notesEndPhrases = ['David Reed'] 9notesEndPhrases = ['David Reed'] 
    11footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg' 10footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg' 
    12    ] 11    ] 
    13 12 
    14def make_re_from_phrase(phrase): 13def make_re_from_phrase(phrase): 
    15    """ 14    """ 
    16    Make a regular expression that matches a phrase and its surrounding 15    Make a regular expression that matches a phrase and its surrounding 
    17    paragraph, i.e. that look like: 16    paragraph, i.e. that look like: 
    18     17     
    19    ... phrase .... 18    ... phrase .... 
    20    more text 19    more text 
    21    [blank] 20    [blank] 
    22    [blank]+ 21    [blank]+ 
    23    """ 22    """ 
    24    paragraphText = '(.+\S.+\n)*' # need \S to ensure not just whitespace 23    paragraphText = '(.+\S.+\n)*' # need \S to ensure not just whitespace 
    25    # [[TODO: check slowdown due to inclusion of '^.*' at start 24    # [[TODO: check slowdown due to inclusion of '^.*' at start 
    26    tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 25    tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 
    27    return re.compile(tmp, re.I | re.M)  # make it case insensitive 26    return re.compile(tmp, re.I | re.M)  # make it case insensitive 
    28 27 
    29class GutenbergShakespeare(object): 28class GutenbergShakespeare(object): 
    30    """ 29    """ 
    31    Process Gutenberg shakespeare texts 30    Process Gutenberg shakespeare texts 
    32    """ 31    """ 
    33     32     
    34    def __init__(self, etext): 33    def __init__(self, etext): 
    35        """ 34        """ 
    36        @param etext: file like object containing the etext 35        @param etext: file like object containing the etext 
    37         36         
    38        Procedure: 37        Procedure: 
    39            1. strip out header and footer bumpf 38            1. strip out header and footer bumpf 
    40            2. are there notes? If so strip them out 39            2. are there notes? If so strip them out 
    41        """ 40        """ 
    42        self.etext = etext 41        self.etext = etext 
    43        self.etextStr = self.etext.read() 42        self.etextStr = self.etext.read() 
    44        # normalize the line endings to save us grief later 43        # normalize the line endings to save us grief later 
    45        self.etextStr = self.etextStr.replace('\r\n', '\n') 44        self.etextStr = self.etextStr.replace('\r\n', '\n') 
    46        self.hasNotes = False 45        self.hasNotes = False 
    47     46     
    48    def _find_max(self, phrase, string): 47    def _find_max(self, phrase, string): 
    49        maxIndex = 0 48        maxIndex = 0 
    50        regex = make_re_from_phrase(phrase) 49        regex = make_re_from_phrase(phrase) 
    51        matches = regex.finditer(string) 50        matches = regex.finditer(string) 
    52        for match in matches: 51        for match in matches: 
    53            maxIndex = max(match.end(), maxIndex) 52            maxIndex = max(match.end(), maxIndex) 
    54        return maxIndex 53        return maxIndex 
    55     54     
    56    def _find_min(self, phrase, string): 55    def _find_min(self, phrase, string): 
    57        minIndex = len(string) 56        minIndex = len(string) 
    58        regex = make_re_from_phrase(phrase) 57        regex = make_re_from_phrase(phrase) 
    59        matches = regex.finditer(string) 58        matches = regex.finditer(string) 
    60        for match in matches: 59        for match in matches: 
    61            minIndex = min(match.start(), minIndex) 60            minIndex = min(match.start(), minIndex) 
    62        return minIndex 61        return minIndex 
    63     62     
    64    def extract_text(self): 63    def extract_text(self): 
    65        """Extract the core text. 64        """Extract the core text. 
    66        """ 65        """ 
    67        self.notesEnd = self.get_notes_end() 66        self.notesEnd = self.get_notes_end() 
    68        self.headerEnd = self.get_header_end() 67        self.headerEnd = self.get_header_end() 
    69        self.footerStart = self.get_footer_start() 68        self.footerStart = self.get_footer_start() 
    70        startIndex = self.headerEnd 69        startIndex = self.headerEnd 
    71        if self.notesEnd > 0: 70        if self.notesEnd > 0: 
    72            startIndex = self.notesEnd 71            startIndex = self.notesEnd 
    73        return self.etextStr[startIndex : self.footerStart].rstrip() 72        return self.etextStr[startIndex : self.footerStart].rstrip() 
    74         73         
    75    def get_notes_end(self): 74    def get_notes_end(self): 
    76        "Return 0 if no notes" 75        "Return 0 if no notes" 
    77        indices = [ self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases] 76        indices = [ self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases] 
    78        index = max(indices) 77        index = max(indices) 
    79        return index 78        return index 
    80     79     
    81    def get_header_end(self): 80    def get_header_end(self): 
    82        indices = [ self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases] 81        indices = [ self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases] 
    83        return max(indices) 82        return max(indices) 
    84     83     
    85    def get_footer_start(self): 84    def get_footer_start(self): 
    86        indices = [ self._find_min(phrase, self.etextStr) for phrase in footerStartPhrases] 85        indices = [ self._find_min(phrase, self.etextStr) for phrase in footerStartPhrases] 
    87        return min(indices) 86        return min(indices) 
    88 87 
    89class TextFormatter(object): 88class TextFormatter(object): 
    90    """Format a provided text in a variety of ways. 89    """Format a provided text in a variety of ways. 
    91    For example: add line numbers, convert to html with line ids etc 90    For example: add line numbers, convert to html with line ids etc 
    92    """ 91    """ 
    93 92 
    94    def __init__(self, file): 93    def __init__(self, file): 
    95        """ 94        """ 
    96        @file: file-like object containing a text in plain txt 95        @file: file-like object containing a text in plain txt 
    97        """ 96        """ 
    98        self.file = file 97        self.file = file 
    99 98 
    100    def format(self, format): 99    def format(self, format): 
    101        """ 100        """ 
    102        @format: the name specifying the format to use 101        @format: the name specifying the format to use 
    103        """ 102        """ 
    104        if format == 'lineno': 103        if format == 'lineno': 
    105            return self.add_line_numbers() 104            return self.add_line_numbers() 
    106        else: 105        else: 
    107            raise ValueError('Unknown format: %s' % format) 106            raise ValueError('Unknown format: %s' % format) 
    108     107     
    109    def add_line_numbers(self): 108    def add_line_numbers(self): 
    110        result = '' 109        result = '' 
    111        count = 0 110        count = 0 
    112        for line in self.file.readlines(): 111        for line in self.file.readlines(): 
    113            tlineno = str(count).ljust(4) # assume line no < 10000 112            tlineno = str(count).ljust(4) # assume line no < 10000 
    114            result += '<pre id="%s">%s %s</pre>\n' % (count, tlineno, line.rstrip()) 113            result += '<pre id="%s">%s %s</pre>\n' % (count, tlineno, line.rstrip()) 
    115            count += 1 114            count += 1 
    116        return result 115        return result 
  • trunk/src/shakespeare/format_test.py

    Revision 1 Revision 2
    1import unittest 1import unittest 
    2 2 
    3import download 3import utils 
    4from format import make_re_from_phrase, GutenbergShakespeare  4from format import make_re_from_phrase, GutenbergShakespeare  
    5 5 
    6def test_suite(): 6def test_suite(): 
    7    suites = [ 7    suites = [ 
    8        unittest.makeSuite(FormatTest), 8        unittest.makeSuite(FormatTest), 
    9        unittest.makeSuite(GutenbergShakespeareTest), 9        unittest.makeSuite(GutenbergShakespeareTest), 
    10        ] 10        ] 
    11    return unittest.TestSuite(suites) 11    return unittest.TestSuite(suites) 
    12 12 
    13class FormatTest(unittest.TestCase): 13class FormatTest(unittest.TestCase): 
    14     14     
    15    def test_make_re_from_phrase(self): 15    def test_make_re_from_phrase(self): 
    16        outStr = """blah 16        outStr = """blah 
    17             17             
    18             18             
    19            """ 19            """ 
    20        inStr = outStr + 'All is Well that' 20        inStr = outStr + 'All is Well that' 
    21        regex = make_re_from_phrase('blah') 21        regex = make_re_from_phrase('blah') 
    22        out = regex.search(inStr) 22        out = regex.search(inStr) 
    23        self.assertEquals(out.group(), outStr) 23        self.assertEquals(out.group(), outStr) 
    24     24     
    25    def test_makeReFromPhrase2(self): 25    def test_makeReFromPhrase2(self): 
    26        outStr = """blah 26        outStr = """blah 
    27            joe 27            joe 
    28            hello 28            hello 
    29             29             
    30             30             
    31            """ 31            """ 
    32        inStr = outStr + 'All is Well that' 32        inStr = outStr + 'All is Well that' 
    33        regex = make_re_from_phrase('blah') 33        regex = make_re_from_phrase('blah') 
    34        out = regex.search(inStr) 34        out = regex.search(inStr) 
    35        self.assertEquals(out.group(), outStr) 35        self.assertEquals(out.group(), outStr) 
    36 36 
    37class GutenbergShakespeareTest(unittest.TestCase): 37class GutenbergShakespeareTest(unittest.TestCase): 
    38    etext1 = file(download.get_cache_path('0ws2510.txt')) 38    etext1 = file(utils.get_cache_path('0ws2510.txt')) 
    39    etext2 = file(download.get_cache_path('2ws2510.txt')) 39    etext2 = file(utils.get_cache_path('2ws2510.txt')) 
    40    gut1 = GutenbergShakespeare(etext1) 40    gut1 = GutenbergShakespeare(etext1) 
    41    gut2 = GutenbergShakespeare(etext2) 41    gut2 = GutenbergShakespeare(etext2) 
    42     42     
    43    def test_get_header_end(self): 43    def test_get_header_end(self): 
    44        out = self.gut1.get_header_end() 44        out = self.gut1.get_header_end() 
    45        exp = self.gut1.etextStr.index("Executive Director's Notes:") 45        exp = self.gut1.etextStr.index("Executive Director's Notes:") 
    46        self.assertEqual(out, exp) 46        self.assertEqual(out, exp) 
    47     47     
    48    def test_get_footer_start(self): 48    def test_get_footer_start(self): 
    49        out = self.gut1.get_footer_start() 49        out = self.gut1.get_footer_start() 
    50        # has no footer  50        # has no footer  
    51        exp = len(self.gut1.etextStr) 51        exp = len(self.gut1.etextStr) 
    52        self.assertEqual(out, exp) 52        self.assertEqual(out, exp) 
    53         53         
    54        out = self.gut2.get_footer_start() 54        out = self.gut2.get_footer_start() 
    55        exp = self.gut2.etextStr.index("End of Project Gutenberg Etext of As You Like It by Shakespeare") 55        exp = self.gut2.etextStr.index("End of Project Gutenberg Etext of As You Like It by Shakespeare") 
    56        self.assertEqual(out, exp) 56        self.assertEqual(out, exp) 
    57     57     
    58    def test_get_notes_end(self): 58    def test_get_notes_end(self): 
    59        out = self.gut1.get_notes_end() 59        out = self.gut1.get_notes_end() 
    60        exp = self.gut1.etextStr.index("As you Like it\n\nActus") 60        exp = self.gut1.etextStr.index("As you Like it\n\nActus") 
    61        self.assertEqual(out, exp) 61        self.assertEqual(out, exp) 
    62 62 
    63    def test_extract_text(self): 63    def test_extract_text(self): 
    64        # [[TODO: run this test on all of the etexts]] 64        # [[TODO: run this test on all of the etexts]] 
    65        for gut in [self.gut1, self.gut2]: 65        for gut in [self.gut1, self.gut2]: 
    66            out = gut.extract_text() 66            out = gut.extract_text() 
    67            notFound = (out.find('Gutenberg') == -1) 67            notFound = (out.find('Gutenberg') == -1) 
    68            self.failUnless(notFound) 68            self.failUnless(notFound) 
    69 69 
  • trunk/src/shakespeare/utils.py

    Revision 1 Revision 2
    1import os 1import os 
    2import urllib 2import urllib 
    3 3 
    4import conf 4import conf 
    5 5 
    6def get_local_path(remoteUrl, version=''): 6def get_local_path(remoteUrl, version=''): 
    7    """Get local path to text of remote url. 7    """Get local path to text of remote url. 
    8    @type: string giving version of text (''|'cleaned') 8    @type: string giving version of text (''|'cleaned') 
    9    """ 9    """ 
    10    host,path = urllib.splithost(remoteUrl) 10    host,path = urllib.splithost(remoteUrl) 
    11    name = os.path.basename(path) 11    name = os.path.basename(path) 
    12    name = version + name 12    name = version + name 
    13    localPath = get_cache_path(name) 13    localPath = get_cache_path(name) 
    14    return localPath 14    return localPath 
    15 15 
    16def download_url(url): 16def download_url(url): 
    17    localPath = get_local_path(url) 17    localPath = get_local_path(url) 
    18    urllib.urlretrieve(url, localPath) 18    urllib.urlretrieve(url, localPath) 
    19 19 
    20def get_cache_path(offset): 20def get_cache_path(offset): 
    21    "Get full path of file in cache given by offset." 21    "Get full path of file in cache given by offset." 
    22    return os.path.join(conf.CACHEDIR, offset) 22    return os.path.join(conf.CACHEDIR, offset) 
      23 
      24def download_gutenberg_index(): 
      25    "Download the Gutenberg Index file GUTINDEX.ALL." 
      26    utils.download_url(conf.GUTINDEX) 
      27 
  • trunk/src/shakespeare/work.py

    Revision 1 Revision 2
    1import os 1import os 
    2import urllib   
    3 2 
    4from utils import * 3import utils 
    5import conf 4import conf 
    6 5 
    7def download_gutenberg_index():   
    8    "Download the Gutenberg Index file GUTINDEX.ALL."   
    9    download_url(conf.GUTINDEX)   
    10 6 
    11def download_all_shakespeare(): 7class GutenbergIndex(object): 
    12    """Download from Project Gutenberg all the shakespeare texts listed in 8    """Parse the index of Gutenberg works so as to find Shakespeare works. 
    13    the index.   
    14    """ 9    """ 
    15    index = make_index() 10     
    16    for item in index: 11    def make_url(self, year, idStr): 
    17        download_url(item[1]) 12        return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 
    18 13 
    19def make_url(year, idStr):  14     def get_shakespeare_list(self): 
    20    return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr)  15         """Get list of shakespeare works and urls. 
       16         Results are sorted by work title. 
       17         """ 
       18         # results have format [ title, url, comments ] 
       19         # folio in comments indicates it is a first folio 
       20         results = [ ["Sonnets", 'http://www.gutenberg.org/dirs/etext97/wssnt10.txt', ''] ] 
       21         plays = self._extract_shakespeare_works() 
       22         for play in plays: 
       23             url = self.make_url(play[1], play[2]) 
       24             results.append([play[0], url, play[3]]) 
       25         def compare_list(item1, item2): 
       26             if item1[0] > item2[0]: return 1 
       27             else: return -1 
       28         results.sort(compare_list) 
       29         return results 
       30      
       31     def _extract_shakespeare_works(self): 
       32         """Get non-copyrighted Shakespeare works from Gutenberg 
       33         Results consist of folio and one other 'standard' version. 
       34         @return: list consisting of tuples in form [title, year, id, comment] 
       35         """ 
       36         ff = file(utils.get_cache_path('GUTINDEX.ALL')) 
       37         results = [] 
       38         for line in ff.readlines(): 
       39             result = self.parse_line_for_folio(line) 
       40             if result: 
       41                 results.append(result + ['folio']) 
       42             resultNormal = self.parse_line_for_normal(line) 
       43             if resultNormal: 
       44                 results.append(resultNormal + ['']) 
       45         return results 
       46      
       47     def parse_line_for_normal(self, line): 
       48         "Parse GUTINDEX line for the 'normal' gutenberg shakespeare versions (i.e. not folio and out of copyright)." 
       49         if 'by William Shakespeare' in line and '[2' in line: 
       50             year = line[4:8] 
       51             tmp = line[9:] 
       52             endOfTitle = tmp.find(', by') 
       53             title = tmp[:endOfTitle] 
       54             startOfId = tmp.find('[2') 
       55             endOfId = tmp.find(']', startOfId) 
       56             idStr = tmp[startOfId+1:endOfId] 
       57             xstart = idStr.find('x') 
       58             idStr = idStr[:xstart] 
       59             return [title, year, idStr] 
     &