Changeset 176
- Timestamp: 08/11/08 22:01:52 (4 months ago)
- Files:
- trunk/shakespeare/concordance.py (modified) (1 diff)
- trunk/shakespeare/concordance_test.py (modified) (1 diff)
- trunk/shakespeare/config/environment.py (modified) (1 diff)
- trunk/shakespeare/gutenberg.py (modified) (1 diff)
- trunk/shakespeare/gutenberg_test.py (modified) (1 diff)
- trunk/shakespeare/index.py (modified) (1 diff)
- trunk/shakespeare/moby.py (modified) (1 diff)
- trunk/shakespeare/model/dm.py (modified) (1 diff)
- trunk/shakespeare/tests/functional/test_site.py (modified) (1 diff)
- trunk/shakespeare/tests/test_model.py (modified) (1 diff)
- trunk/shakespeare/websetup.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/shakespeare/concordance.py
Revision 154 Revision 176 1 """ 1 """ 2 Concordance (and statistics) for texts in database. 2 Concordance (and statistics) for texts in database. 3 3 4 To build concordance use ConcordanceBuilder. To access concordance/statistics 4 To build concordance use ConcordanceBuilder. To access concordance/statistics 5 use Concordance/Statistics class. Concordance and statistics are provided as 5 use Concordance/Statistics class. Concordance and statistics are provided as 6 dictionaries keyed by words. 6 dictionaries keyed by words. 7 7 8 NB: all word keys have been lower-cased in order to render them 8 NB: all word keys have been lower-cased in order to render them 9 case-insensitive 9 case-insensitive 10 """ 10 """ 11 import re 11 import re 12 12 13 import sqlobject14 15 import shakespeare.index 13 import shakespeare.index 16 import shakespeare.cache 14 import shakespeare.cache 17 15 18 16 19 class ConcordanceBase(object): 17 class ConcordanceBase(object): 20 """ 18 """ 21 TODO: caching?? 19 TODO: caching?? 22 """ 20 """ 23 sqlcc = shakespeare.model.Concordance21 # sqlcc = shakespeare.model.Concordance 24 sqlstat = shakespeare.model.Statistic 22 sqlstat = shakespeare.model.Statistic 25 23 26 def __init__(self, filter_names=None): 24 def __init__(self, filter_names=None): 27 """ 25 """ 28 @param filter_names: a list of id names with which to filter results 26 @param filter_names: a list of id names with which to filter results 29 (i.e. only return results relating to those texts) 27 (i.e. 
only return results relating to those texts) 30 """ 28 """ 31 self._filter_names = filter_names 29 self._filter_names = filter_names 32 self.sqlcc_filter = self._make_filter(self.sqlcc) 30 self.sqlcc_filter = self._make_filter(self.sqlcc) 33 self.sqlstat_filter = self._make_filter(self.sqlstat) 31 self.sqlstat_filter = self._make_filter(self.sqlstat) 34 32 35 def _make_filter(self, sqlobj): 33 def _make_filter(self, sqlobj): 36 sql_filter = True 34 sql_filter = True 37 if self._filter_names is not None: 35 if self._filter_names is not None: 38 arglist = [] 36 arglist = [] 39 for name in self._filter_names: 37 for name in self._filter_names: 40 newarg = sqlobj.q.textID == self._name2id(name) 38 newarg = sqlobj.q.textID == self._name2id(name) 41 arglist.append(newarg) 39 arglist.append(newarg) 42 sql_filter = sqlobject.OR(*arglist) 40 sql_filter = sqlobject.OR(*arglist) 43 return sql_filter 41 return sql_filter 44 42 45 def _name2id(self, name): 43 def _name2id(self, name): 46 return shakespeare.model.Material.byName(name).id 44 return shakespeare.model.Material.byName(name).id 47 45 48 def keys(self): 46 def keys(self): 49 """Return list of *distinct* words in concordance/statistics 47 """Return list of *distinct* words in concordance/statistics 50 """ 48 """ 51 all = self.sqlstat.select(self.sqlstat_filter, 49 all = self.sqlstat.select(self.sqlstat_filter, 52 orderBy=self.sqlstat.q.word, 50 orderBy=self.sqlstat.q.word, 53 ) 51 ) 54 words = [ xx.word for xx in list(all) ] 52 words = [ xx.word for xx in list(all) ] 55 distinct = list(set(words)) 53 distinct = list(set(words)) 56 distinct.sort() 54 distinct.sort() 57 return distinct 55 return distinct 58 56 59 57 60 class Concordance(ConcordanceBase): 58 class Concordance(ConcordanceBase): 61 """Concordance by word for a set of texts 59 """Concordance by word for a set of texts 62 """ 60 """ 63 61 64 def get(self, word): 62 def get(self, word): 65 """Get list of occurrences for word 63 """Get list of occurrences for 
word 66 @return: sqlobject query list 64 @return: sqlobject query list 67 """ 65 """ 68 select = self.sqlcc.select(sqlobject.AND(self.sqlcc_filter, self.sqlcc.q.word==word)) 66 select = self.sqlcc.select(sqlobject.AND(self.sqlcc_filter, self.sqlcc.q.word==word)) 69 return select 67 return select 70 68 71 class Statistics(ConcordanceBase): 69 class Statistics(ConcordanceBase): 72 70 73 def get(self, word): 71 def get(self, word): 74 select = self.sqlstat.select( 72 select = self.sqlstat.select( 75 sqlobject.AND(self.sqlstat_filter, self.sqlstat.q.word==word) 73 sqlobject.AND(self.sqlstat_filter, self.sqlstat.q.word==word) 76 ) 74 ) 77 total = 0 75 total = 0 78 for stat in select: 76 for stat in select: 79 total += stat.occurrences 77 total += stat.occurrences 80 return total 78 return total 81 79 82 class ConcordanceBuilder(object): 80 class ConcordanceBuilder(object): 83 """Build a concordance and associated statistics for a set of texts. 81 """Build a concordance and associated statistics for a set of texts. 
84 82 85 """ 83 """ 86 84 87 # multiline, unicode and ignorecase 85 # multiline, unicode and ignorecase 88 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 86 word_regex = re.compile(r'\b(\w+)\b', re.U | re.M | re.I) 89 87 90 words_to_ignore = [ 88 words_to_ignore = [ 91 # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'in' 89 # 'a', 'the', 'and', 'as', 'are', 'be', 'but', 'in' 92 ] 90 ] 93 non_words = [ 91 non_words = [ 94 'd', # accus'd 92 'd', # accus'd 95 't', 93 't', 96 ] 94 ] 97 95 98 def is_roman_numeral(self, word): 96 def is_roman_numeral(self, word): 99 digits = [ 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix' ] 97 digits = [ 'i', 'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix' ] 100 others = [ 'l', 'x', 'c' ] 98 others = [ 'l', 'x', 'c' ] 101 if word == 'i': return False # exception because this conflicts with I 99 if word == 'i': return False # exception because this conflicts with I 102 while word[0] in others: 100 while word[0] in others: 103 if len(word) == 1: 101 if len(word) == 1: 104 return True 102 return True 105 else: 103 else: 106 word = word[1:] 104 word = word[1:] 107 return word in digits 105 return word in digits 108 106 109 def ignore_word(self, word): 107 def ignore_word(self, word): 110 "Return True if this word should not be added to the concordance." 108 "Return True if this word should not be added to the concordance." 
111 bool1 = word in self.words_to_ignore 109 bool1 = word in self.words_to_ignore 112 bool2 = word in self.non_words 110 bool2 = word in self.non_words 113 # do roman numerals 111 # do roman numerals 114 bool3 = self.is_roman_numeral(word) 112 bool3 = self.is_roman_numeral(word) 115 return bool1 or bool2 or bool3 113 return bool1 or bool2 or bool3 116 114 117 def _text_already_done(self, text): 115 def _text_already_done(self, text): 118 numrecs = shakespeare.model.Concordance.select( 116 numrecs = shakespeare.model.Concordance.select( 119 shakespeare.model.Concordance.q.textID==text.id 117 shakespeare.model.Concordance.q.textID==text.id 120 ).count() 118 ).count() 121 return numrecs > 0 119 return numrecs > 0 122 120 123 def add_text(self, name, text=None): 121 def add_text(self, name, text=None): 124 """Add a text to the concordance. 122 """Add a text to the concordance. 125 @param name: name of text to add 123 @param name: name of text to add 126 @param text: [optional] a file-like object containing text data. If not 124 @param text: [optional] a file-like object containing text data. 
If not 127 provided will default to using file in cache associated with named 125 provided will default to using file in cache associated with named 128 text 126 text 129 """ 127 """ 130 dmText = shakespeare.model.Material.byName(name) 128 dmText = shakespeare.model.Material.byName(name) 131 if self._text_already_done(dmText): 129 if self._text_already_done(dmText): 132 msg = 'Have already added to concordance text: %s' % dmText 130 msg = 'Have already added to concordance text: %s' % dmText 133 # raise ValueError(msg) 131 # raise ValueError(msg) 134 print msg 132 print msg 135 print 'Skipping' 133 print 'Skipping' 136 return 134 return 137 if text is None: 135 if text is None: 138 tpath = dmText.get_text() 136 tpath = dmText.get_text() 139 text = file(tpath) 137 text = file(tpath) 140 lineCount = 0 138 lineCount = 0 141 charIndex = 0 139 charIndex = 0 142 stats = {} 140 stats = {} 143 trans = shakespeare.model.Concordance._connection.transaction() 141 trans = shakespeare.model.Concordance._connection.transaction() 144 for line in text.readlines(): 142 for line in text.readlines(): 145 for match in self.word_regex.finditer(line): 143 for match in self.word_regex.finditer(line): 146 word = match.group().lower() # case insensitive 144 word = match.group().lower() # case insensitive 147 if self.ignore_word(word): 145 if self.ignore_word(word): 148 continue 146 continue 149 shakespeare.model.Concordance(connection=trans, 147 shakespeare.model.Concordance(connection=trans, 150 text=dmText, 148 text=dmText, 151 word=word, 149 word=word, 152 line=lineCount, 150 line=lineCount, 153 char_index=charIndex+match.start()) 151 char_index=charIndex+match.start()) 154 stats[word] = stats.get(word, 0) + 1 152 stats[word] = stats.get(word, 0) + 1 155 lineCount += 1 153 lineCount += 1 156 charIndex += len(line) 154 charIndex += len(line) 157 trans.commit() 155 trans.commit() 158 trans = shakespeare.model.Concordance._connection.transaction() 156 trans = 
shakespeare.model.Concordance._connection.transaction() 159 for word, value in stats.items(): 157 for word, value in stats.items(): 160 tresults = shakespeare.model.Statistic.select( 158 tresults = shakespeare.model.Statistic.select( 161 sqlobject.AND( 159 sqlobject.AND( 162 shakespeare.model.Statistic.q.textID == dmText.id, 160 shakespeare.model.Statistic.q.textID == dmText.id, 163 shakespeare.model.Statistic.q.word == word 161 shakespeare.model.Statistic.q.word == word 164 )) 162 )) 165 try: 163 try: 166 dbstat = list(tresults)[0] 164 dbstat = list(tresults)[0] 167 dbstat.occurrences += value 165 dbstat.occurrences += value 168 except: 166 except: 169 shakespeare.model.Statistic( 167 shakespeare.model.Statistic( 170 connection=trans, 168 connection=trans, 171 text=dmText, 169 text=dmText, 172 word=word, 170 word=word, 173 occurrences=value 171 occurrences=value 174 ) 172 ) 175 trans.commit() 173 trans.commit() 176 174 177 175 178 def remove_text(self, name): 176 def remove_text(self, name): 179 """Remove a text from the concordance. 177 """Remove a text from the concordance. 
180 178 181 @param name: as for add_text 179 @param name: as for add_text 182 """ 180 """ 183 dmText = shakespeare.model.Material.byName(name) 181 dmText = shakespeare.model.Material.byName(name) 184 recs = shakespeare.model.Concordance.select( 182 recs = shakespeare.model.Concordance.select( 185 shakespeare.model.Concordance.q.textID==dmText.id 183 shakespeare.model.Concordance.q.textID==dmText.id 186 ) 184 ) 187 for rec in recs: 185 for rec in recs: 188 shakespeare.model.Concordance.delete(rec.id) 186 shakespeare.model.Concordance.delete(rec.id) 189 stats = shakespeare.model.Statistic.select( 187 stats = shakespeare.model.Statistic.select( 190 shakespeare.model.Statistic.q.textID==dmText.id 188 shakespeare.model.Statistic.q.textID==dmText.id 191 ) 189 ) 192 for stat in stats: 190 for stat in stats: 193 shakespeare.model.Statistic.delete(stat.id) 191 shakespeare.model.Statistic.delete(stat.id) 194 192 trunk/shakespeare/concordance_test.py
Revision 150 Revision 176 1 import unittest 1 import unittest 2 import StringIO 2 import StringIO 3 import tempfile 3 import tempfile 4 4 5 5 6 import shakespeare.index 6 import shakespeare.index 7 import shakespeare.concordance 7 import shakespeare.concordance 8 8 9 class TestConcordancer: 9 # Disable in preparation for removal 10 class _TestConcordancer: 10 11 11 inText = \ 12 inText = \ 12 """A fake fake line 13 """A fake fake line 13 SUFFOLK. 14 SUFFOLK. 14 As by your high imperial Majesty 15 As by your high imperial Majesty 15 I had in charge at my depart for France, 16 I had in charge at my depart for France, 16 As procurator to your excellence, 17 As procurator to your excellence, 17 A fake imperial line. 18 A fake imperial line. 18 """ 19 """ 19 name = 'test-concordance' 20 name = 'test-concordance' 20 title = 'Hamlet' 21 title = 'Hamlet' 21 22 22 # ['work_id', 'line-no', 'character-index'] } 23 # ['work_id', 'line-no', 'character-index'] } 23 # incomplete 24 # incomplete 24 expConcordance = { 25 expConcordance = { 25 'fake' : [ (name, 0, 2), (name, 0, 7), (name, 5, 136) ], 26 'fake' : [ (name, 0, 2), (name, 0, 7), (name, 5, 136) ], 26 'suffolk' : [ (name, 1, 17), ], 27 'suffolk' : [ (name, 1, 17), ], 27 'high' : [ (name, 2, 37), ], 28 'high' : [ (name, 2, 37), ], 28 'word_that_is_not_there' : [], 29 'word_that_is_not_there' : [], 29 } 30 } 30 31 31 # incomplete 32 # incomplete 32 expStats = { 33 expStats = { 33 'fake' : 3, 34 'fake' : 3, 34 'imperial' : 2, 35 'imperial' : 2, 35 'suffolk' : 1, 36 'suffolk' : 1, 36 'high' : 1, 37 'high' : 1, 37 'word_that_is_not_there' : 0, 38 'word_that_is_not_there' : 0, 38 } 39 } 39 40 40 @classmethod 41 @classmethod 41 def setup_class(cls): 42 def setup_class(cls): 42 cls.builder = shakespeare.concordance.ConcordanceBuilder() 43 cls.builder = shakespeare.concordance.ConcordanceBuilder() 43 # try deleting it first so as to be more robust to errors 44 # try deleting it first so as to be more robust to errors 44 # does not 
seem to work with the class methods 45 # does not seem to work with the class methods 45 # cls.teardown_class(cls) 46 # cls.teardown_class(cls) 46 cls.text = shakespeare.model.Material(name=cls.name, title=cls.title) 47 cls.text = shakespeare.model.Material(name=cls.name, title=cls.title) 47 cls.builder.add_text(cls.name, StringIO.StringIO(cls.inText)) 48 cls.builder.add_text(cls.name, StringIO.StringIO(cls.inText)) 48 cls.concordance = shakespeare.concordance.Concordance([cls.name]) 49 cls.concordance = shakespeare.concordance.Concordance([cls.name]) 49 cls.statistics = shakespeare.concordance.Statistics([cls.name]) 50 cls.statistics = shakespeare.concordance.Statistics([cls.name]) 50 51 51 @classmethod 52 @classmethod 52 def teardown_class(cls): 53 def teardown_class(cls): 53 # allow us to deal with left over stuff from previous errors 54 # allow us to deal with left over stuff from previous errors 54 try: 55 try: 55 cls.builder.remove_text(cls.name) 56 cls.builder.remove_text(cls.name) 56 tmp = shakespeare.model.Material.byName(cls.name) 57 tmp = shakespeare.model.Material.byName(cls.name) 57 shakespeare.model.Material.delete(tmp.id) 58 shakespeare.model.Material.delete(tmp.id) 58 except: 59 except: 59 pass 60 pass 60 61 61 def test__process_line(self): 62 def test__process_line(self): 62 line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 63 line = 'the - quick, brown. fox-jumped over$ the_lazy do8g.' 
63 exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 64 exp = ['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the_lazy', 'do8g' ] 64 out = self.builder.word_regex.findall(line) 65 out = self.builder.word_regex.findall(line) 65 assert exp == out 66 assert exp == out 66 67 67 def test_is_roman_numeral(self): 68 def test_is_roman_numeral(self): 68 testvals = [ 'ii', 'v', 'vi', 'xi', 'xx', 'xxi', 'xlvi', 'c', 'cvi' ] 69 testvals = [ 'ii', 'v', 'vi', 'xi', 'xx', 'xxi', 'xlvi', 'c', 'cvi' ] 69 for val in testvals: 70 for val in testvals: 70 assert self.builder.is_roman_numeral(val) 71 assert self.builder.is_roman_numeral(val) 71 72 72 def test_ignore_word(self): 73 def test_ignore_word(self): 73 testvals = [ 'd', 't' ] 74 testvals = [ 'd', 't' ] 74 for val in testvals: 75 for val in testvals: 75 assert self.builder.ignore_word(val) 76 assert self.builder.ignore_word(val) 76 77 77 def test_concordance(self): 78 def test_concordance(self): 78 for key, value in self.expConcordance.items(): 79 for key, value in self.expConcordance.items(): 79 listing = list(self.concordance.get(key)) 80 listing = list(self.concordance.get(key)) 80 assert len(listing) == len(value) 81 assert len(listing) == len(value) 81 for xx in listing: 82 for xx in listing: 82 assert (xx.text.name, xx.line, xx.char_index) in value 83 assert (xx.text.name, xx.line, xx.char_index) in value 83 84 84 def test_stats(self): 85 def test_stats(self): 85 for key, value in self.expStats.items(): 86 for key, value in self.expStats.items(): 86 out = self.statistics.get(key) 87 out = self.statistics.get(key) 87 print key 88 print key 88 assert out == value 89 assert out == value 89 90 90 def test_keys(self): 91 def test_keys(self): 91 words = self.concordance.keys() 92 words = self.concordance.keys() 92 assert 'a' == words[0] 93 assert 'a' == words[0] 93 assert 'your' == words[-1] 94 assert 'your' == words[-1] 94 assert 22 == len(words) 95 assert 22 == len(words) 
trunk/shakespeare/config/environment.py
Revision 148 Revision 176 1 """Pylons environment configuration""" 1 """Pylons environment configuration""" 2 import os 2 import os 3 4 from sqlalchemy import engine_from_config 3 5 4 from pylons import config 6 from pylons import config 5 7 6 import shakespeare.lib.app_globals as app_globals 8 import shakespeare.lib.app_globals as app_globals 7 import shakespeare.lib.helpers 9 import shakespeare.lib.helpers 8 from shakespeare.config.routing import make_map 10 from shakespeare.config.routing import make_map 9 11 10 def load_environment(global_conf, app_conf): 12 def load_environment(global_conf, app_conf): 11 """Configure the Pylons environment via the ``pylons.config`` 13 """Configure the Pylons environment via the ``pylons.config`` 12 object 14 object 13 """ 15 """ 14 # Pylons paths 16 # Pylons paths 15 root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 16 paths = dict(root=root, 18 paths = dict(root=root, 17 controllers=os.path.join(root, 'controllers'), 19 controllers=os.path.join(root, 'controllers'), 18 static_files=os.path.join(root, 'public'), 20 static_files=os.path.join(root, 'public'), 19 templates=[os.path.join(root, 'templates')]) 21 templates=[os.path.join(root, 'templates')]) 20 22 21 # Initialize config with the basic options 23 # Initialize config with the basic options 22 config.init_app(global_conf, app_conf, package='shakespeare', 24 config.init_app(global_conf, app_conf, package='shakespeare', 23 template_engine='genshi', paths=paths) 25 template_engine='genshi', paths=paths) 24 26 25 config['routes.map'] = make_map() 27 config['routes.map'] = make_map() 26 config['pylons.g'] = app_globals.Globals() 28 config['pylons.g'] = app_globals.Globals() 27 config['pylons.h'] = shakespeare.lib.helpers 29 config['pylons.h'] = shakespeare.lib.helpers 28 30 29 # Customize templating options via this variable 31 # Customize templating options via this variable 30 
tmpl_options = config['buffet.template_options'] 32 tmpl_options = config['buffet.template_options'] 31 33 32 # CONFIGURATION OPTIONS HERE (note: all config options will override 34 # CONFIGURATION OPTIONS HERE (note: all config options will override 33 # any Pylons config options) 35 # any Pylons config options) 36 config['pylons.g'].sa_engine = engine_from_config(config, 'sqlalchemy.') trunk/shakespeare/gutenberg.py
Revision 150 Revision 176 1 """Various useful functionality related to Project Gutenberg 1 """Various useful functionality related to Project Gutenberg 2 """ 2 """ 3 import os 3 import os 4 import StringIO 4 import StringIO 5 import shakespeare.cache 5 import shakespeare.cache 6 6 7 7 8 class GutenbergIndex(object): 8 class GutenbergIndex(object): 9 """Parse the index of Gutenberg works so as to find Shakespeare works. 9 """Parse the index of Gutenberg works so as to find Shakespeare works. 10 10 11 TODO: Gutenberg now make available the index in RDF/XML form: 11 TODO: Gutenberg now make available the index in RDF/XML form: 12 http://www.gutenberg.org/feeds/catalog.rdf.bz2 and we should try to use 12 http://www.gutenberg.org/feeds/catalog.rdf.bz2 and we should try to use 13 that instead of plain text file 13 that instead of plain text file 14 """ 14 """ 15 15 16 # url for the Gutenberg index file 16 # url for the Gutenberg index file 17 gutindex = 'http://www.gutenberg.org/dirs/GUTINDEX.ALL' 17 gutindex = 'http://www.gutenberg.org/dirs/GUTINDEX.ALL' 18 18 19 def __init__(self): 19 def __init__(self): 20 self.download_gutenberg_index() 20 self.download_gutenberg_index() 21 self._gutindex_local_path = shakespeare.cache.default.path(self.gutindex) 21 self._gutindex_local_path = shakespeare.cache.default.path(self.gutindex) 22 22 23 def download_gutenberg_index(self): 23 def download_gutenberg_index(self): 24 """Download the Gutenberg Index file GUTINDEX.ALL to cache if we don't 24 """Download the Gutenberg Index file GUTINDEX.ALL to cache if we don't 25 have it already. 25 have it already. 
26 """ 26 """ 27 shakespeare.cache.default.download_url(self.gutindex) 27 shakespeare.cache.default.download_url(self.gutindex) 28 28 29 def make_url(self, year, idStr): 29 def make_url(self, year, idStr): 30 return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 30 return 'http://www.gutenberg.org/dirs/etext%s/%s10.txt' % (year[2:], idStr) 31 31 32 def get_shakespeare_list(self): 32 def get_shakespeare_list(self): 33 """Get list of shakespeare works and urls. 33 """Get list of shakespeare works and urls. 34 34 35 Results are sorted by work title. 35 Results are sorted by work title. 36 36 37 Notes regarding list of plays: 37 Notes regarding list of plays: 38 38 39 * no Folio edition of Troilus and Cressida 39 * no Folio edition of Troilus and Cressida 40 * no Folio edition of Pericles 40 * no Folio edition of Pericles 41 """ 41 """ 42 # results have format [ title, url, comments ] 42 # results have format [ title, url, comments ] 43 # folio in comments indicates it is a first folio 43 # folio in comments indicates it is a first folio 44 results = [ ["Sonnets", 'http://www.gutenberg.org/dirs/etext97/wssnt10.txt', ''] ] 44 results = [ ["Sonnets", 'http://www.gutenberg.org/dirs/etext97/wssnt10.txt', ''] ] 45 plays = self._extract_shakespeare_works() 45 plays = self._extract_shakespeare_works() 46 for play in plays: 46 for play in plays: 47 url = self.make_url(play[1], play[2]) 47 url = self.make_url(play[1], play[2]) 48 results.append([play[0], url, play[3]]) 48 results.append([play[0], url, play[3]]) 49 # add in by hand some exceptions 49 # add in by hand some exceptions 50 results.append(["The Winter's Tale", 50 results.append(["The Winter's Tale", 51 'http://www.gutenberg.org/files/1539/1539.txt', ''] 51 'http://www.gutenberg.org/files/1539/1539.txt', ''] 52 ) 52 ) 53 def compare_list(item1, item2): 53 def compare_list(item1, item2): 54 if item1[0] > item2[0]: return 1 54 if item1[0] > item2[0]: return 1 55 else: return -1 55 else: return -1 
56 results.sort(compare_list) 56 results.sort(compare_list) 57 return results 57 return results 58 58 59 def _extract_shakespeare_works(self): 59 def _extract_shakespeare_works(self): 60 """Get non-copyrighted Shakespeare works from Gutenberg 60 """Get non-copyrighted Shakespeare works from Gutenberg 61 Results consist of folio and one other 'standard' version. 61 Results consist of folio and one other 'standard' version. 62 @return: list consisting of tuples in form [title, year, id, comment] 62 @return: list consisting of tuples in form [title, year, id, comment] 63 """ 63 """ 64 ff = file(self._gutindex_local_path) 64 ff = file(self._gutindex_local_path) 65 results = [] 65 results = [] 66 for line in ff.readlines(): 66 for line in ff.readlines(): 67 result = self.parse_line_for_folio(line) 67 result = self.parse_line_for_folio(line) 68 if result: 68 if result: 69 results.append(result + ['folio']) 69 results.append(result + ['folio']) 70 resultNormal = self.parse_line_for_normal(line) 70 resultNormal = self.parse_line_for_normal(line) 71 if resultNormal: 71 if resultNormal: 72 results.append(resultNormal + ['']) 72 results.append(resultNormal + ['']) 73 return results 73 return results 74 74 75 def parse_line_for_normal(self, line): 75 def parse_line_for_normal(self, line): 76 """Parse GUTINDEX for 'normal' gutenberg shakespeare versions (i.e. not 76 """Parse GUTINDEX for 'normal' gutenberg shakespeare versions (i.e. not 77 folio and out of copyright). 77 folio and out of copyright). 
78 """ 78 """ 79 # normal shakespeare are those with id starting [2 79 # normal shakespeare are those with id starting [2 80 # most have 'by William Shakespeare' but also have 'by Shakespeare' 80 # most have 'by William Shakespeare' but also have 'by Shakespeare' 81 # (Othello) and 'by Wm Shakespeare' (Titus Andronicus) 81 # (Othello) and 'by Wm Shakespeare' (Titus Andronicus) 82 # everything is by William Shakespeare except for Othello 82 # everything is by William Shakespeare except for Othello 83 if ('Shakespeare' in line and '[2' in line 83 if ('Shakespeare' in line and '[2' in line 84 and 'mp3' not in line and 'Apocrypha' not in line): 84 and 'mp3' not in line and 'Apocrypha' not in line): 85 year = line[4:8] 85 year = line[4:8] 86 tmp = line[9:] 86 tmp = line[9:] 87 endOfTitle = tmp.find(', by') 87 endOfTitle = tmp.find(', by') 88 title = tmp[:endOfTitle] 88 title = tmp[:endOfTitle] 89 startOfId = tmp.find('[2') 89 startOfId = tmp.find('[2') 90 endOfId = tmp.find(']', startOfId) 90 endOfId = tmp.find(']', startOfId) 91 idStr = tmp[startOfId+1:endOfId] 91 idStr = tmp[startOfId+1:endOfId] 92 xstart = idStr.find('x') 92 xstart = idStr.find('x') 93 idStr = idStr[:xstart] 93 idStr = idStr[:xstart] 94 return [title, year, idStr] 94 return [title, year, idStr] 95 95 96 def parse_line_for_folio(self, line): 96 def parse_line_for_folio(self, line): 97 if '[FF]' in line: 97 if '[FF]' in line: 98 year = line[4:8] 98 year = line[4:8] 99 tmp = line[9:] 99 tmp = line[9:] 100 endOfTitle = tmp.find(', by') 100 endOfTitle = tmp.find(', by') 101 title = tmp[:endOfTitle] 101 title = tmp[:endOfTitle] 102 startOfId = tmp.find('[FF]') + 5 102 startOfId = tmp.find('[FF]') + 5 103 endOfId = tmp.find(']', startOfId) 103 endOfId = tmp.find(']', startOfId) 104 idStr = tmp[startOfId+1:endOfId] 104 idStr = tmp[startOfId+1:endOfId] 105 xstart = idStr.find('x') 105 xstart = idStr.find('x') 106 idStr = idStr[:xstart] 106 idStr = idStr[:xstart] 107 return [title, year, idStr] 107 return 
[title, year, idStr] 108 else: 108 else: 109 return None 109 return None 110 110 111 111 112 """ 112 """ 113 Clean up Gutenberg texts by removing all the header and footer bumpf 113 Clean up Gutenberg texts by removing all the header and footer bumpf 114 """ 114 """ 115 115 116 import re 116 import re 117 117 118 headerEndPhrases = ["Project Gutenberg's Etext of", 'This etext was prepared by'] 118 headerEndPhrases = ["Project Gutenberg's Etext of", 'This etext was prepared by'] 119 notesStartPhrases = ["Executive Director's Notes:"] 119 notesStartPhrases = ["Executive Director's Notes:"] 120 notesEndPhrases = ['David Reed'] 120 notesEndPhrases = ['David Reed'] 121 footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg' 121 footerStartPhrases = ['End of Project Gutenberg', 'End of The Project Gutenberg' 122 ] 122 ] 123 123 124 def make_re_from_phrase(phrase): 124 def make_re_from_phrase(phrase): 125 """ 125 """ 126 Make a regular expression that matches a phrase and its surrounding 126 Make a regular expression that matches a phrase and its surrounding 127 paragraph, i.e. that look like: 127 paragraph, i.e. that look like: 128 128 129 ... phrase .... 129 ... phrase .... 
130 more text 130 more text 131 [blank] 131 [blank] 132 [blank]+ 132 [blank]+ 133 """ 133 """ 134 paragraphText = '(.+\S.+\n)*' # need \S to ensure not just whitespace 134 paragraphText = '(.+\S.+\n)*' # need \S to ensure not just whitespace 135 # [[TODO: check slowdown due to inclusion of '^.*' at start 135 # [[TODO: check slowdown due to inclusion of '^.*' at start 136 tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 136 tmp = '^.*' + phrase + '.*\n' + paragraphText + '\s+' 137 return re.compile(tmp, re.I | re.M) # make it case insensitive 137 return re.compile(tmp, re.I | re.M) # make it case insensitive 138 138 139 class GutenbergShakespeare(object): 139 class GutenbergShakespeare(object): 140 """ 140 """ 141 Process Gutenberg shakespeare texts 141 Process Gutenberg shakespeare texts 142 """ 142 """ 143 143 144 def __init__(self, etext): 144 def __init__(self, etext): 145 """ 145 """ 146 @param etext: file like object containing the etext 146 @param etext: file like object containing the etext 147 147 148 Procedure: 148 Procedure: 149 1. strip out header and footer bumpf 149 1. strip out header and footer bumpf 150 2. are there notes? If so strip them out 150 2. are there notes? 
If so strip them out 151 """ 151 """ 152 self.etext = etext 152 self.etext = etext 153 # most shakespeare texts are either ascii or latin-1 153 # most shakespeare texts are either ascii or latin-1 154 self.etextStr = unicode(self.etext.read(), 'latin-1').encode('utf-8') 154 self.etextStr = unicode(self.etext.read(), 'latin-1').encode('utf-8') 155 # normalize the line endings to save us grief later 155 # normalize the line endings to save us grief later 156 self.etextStr = self.etextStr.replace('\r\n', '\n') 156 self.etextStr = self.etextStr.replace('\r\n', '\n') 157 self.hasNotes = False 157 self.hasNotes = False 158 158 159 def _find_max(self, phrase, string): 159 def _find_max(self, phrase, string): 160 maxIndex = 0 160 maxIndex = 0 161 regex = make_re_from_phrase(phrase) 161 regex = make_re_from_phrase(phrase) 162 matches = regex.finditer(string) 162 matches = regex.finditer(string) 163 for match in matches: 163 for match in matches: 164 maxIndex = max(match.end(), maxIndex) 164 maxIndex = max(match.end(), maxIndex) 165 return maxIndex 165 return maxIndex 166 166 167 def _find_min(self, phrase, string): 167 def _find_min(self, phrase, string): 168 minIndex = len(string) 168 minIndex = len(string) 169 regex = make_re_from_phrase(phrase) 169 regex = make_re_from_phrase(phrase) 170 matches = regex.finditer(string) 170 matches = regex.finditer(string) 171 for match in matches: 171 for match in matches: 172 minIndex = min(match.start(), minIndex) 172 minIndex = min(match.start(), minIndex) 173 return minIndex 173 return minIndex 174 174 175 def extract_text(self): 175 def extract_text(self): 176 """Extract the core text. 176 """Extract the core text. 
177 """ 177 """ 178 self.notesEnd = self.get_notes_end() 178 self.notesEnd = self.get_notes_end() 179 self.headerEnd = self.get_header_end() 179 self.headerEnd = self.get_header_end() 180 self.footerStart = self.get_footer_start() 180 self.footerStart = self.get_footer_start() 181 startIndex = self.headerEnd 181 startIndex = self.headerEnd 182 if self.notesEnd > 0: 182 if self.notesEnd > 0: 183 startIndex = self.notesEnd 183 startIndex = self.notesEnd 184 return self.etextStr[startIndex : self.footerStart].rstrip() 184 return self.etextStr[startIndex : self.footerStart].rstrip() 185 185 186 def get_notes_end(self): 186 def get_notes_end(self): 187 "Return 0 if no notes" 187 "Return 0 if no notes" 188 indices = [ self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases] 188 indices = [ self._find_max(phrase, self.etextStr) for phrase in notesEndPhrases] 189 index = max(indices) 189 index = max(indices) 190 return index 190 return index 191 191 192 def get_header_end(self): 192 def get_header_end(self): 193 indices = [ self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases] 193 indices = [ self._find_max(phrase, self.etextStr) for phrase in headerEndPhrases] 194 return max(indices) 194 return max(indices) 195 195 196 def get_footer_start(self): 196 def get_footer_start(self): 197 indices = [ self._find_min(phrase, self.etextStr) for phrase in footerStartPhrases] 197 indices = [ self._find_min(phrase, self.etextStr) for phrase in footerStartPhrases] 198 return min(indices) 198 return min(indices) 199 199 200 200 201 #def get_etext_url(number): 201 #def get_etext_url(number): 202 # """ 202 # """ 203 # [[TODO: DOES NOT WORK]] 203 # [[TODO: DOES NOT WORK]] 204 # Get the url for an etext given its number. 204 # Get the url for an etext given its number. 
205 # This is non-trivial and follows instructions at start of GUTINDEX.ALL 205 # This is non-trivial and follows instructions at start of GUTINDEX.ALL 206 # """ 206 # """ 207 # baseUrl = 'http://www.gutenberg.org/dirs/' 207 # baseUrl = 'http://www.gutenberg.org/dirs/' 208 # ss = '' 208 # ss = '' 209 # if number > 10000: 209 # if number > 10000: 210 # ss = str(number) 210 # ss = str(number) 211 # for char in ss[:-1]: 211 # for char in ss[:-1]: 212 # pass 212 # pass 213 # if number <= 10000: 213 # if number <= 10000: 214 # raise 'Cannot deal with etext numbers less than 10000' 214 # raise 'Cannot deal with etext numbers less than 10000' 215 # return ss 215 # return ss 216 216 217 217 218 class Helper(object): 218 class Helper(object): 219 219 220 def __init__(self, verbose=False): 220 def __init__(self, verbose=False): 221 self.verbose = verbose 221 self.verbose = verbose 222 gutindex = GutenbergIndex() 222 gutindex = GutenbergIndex() 223 self._index = gutindex.get_shakespeare_list() 223 self._index = gutindex.get_shakespeare_list() 224 224 225 def _filter_index(self, line): 225 def _filter_index(self, line): 226 """Filter items in index return only those whose id (url) is in line. 226 """Filter items in index return only those whose id (url) is in line. 
227 If line is empty or None return all items 227 If line is empty or None return all items 228 """ 228 """ 229 if line: 229 if line: 230 textsToAdd = [] 230 textsToAdd = [] 231 textsUrls = line.split() 231 textsUrls = line.split() 232 for item in self._index: 232 for item in self._index: 233 if item[1] in textsUrls: 233 if item[1] in textsUrls: 234 textsToAdd.append(item) 234 textsToAdd.append(item) 235 return textsToAdd 235 return textsToAdd 236 else: 236 else: 237 return self._index 237 return self._index 238 238 239 def execute(self, line=None): 239 def execute(self, line=None): 240 self.download(line) 240 self.download(line) 241 self.clean(line) 241 self.clean(line) 242 self.add_to_db() 242 self.add_to_db() 243 243 <
