#!/usr/bin/env python '''Extension to python-markdown to support LaTeX (rather than html) output. Authored by Rufus Pollock: Usage: ====== 1. Command Line A script entitled markdown2latex.py is automatically installed. For details of usage see help:: $ markdown2latex.py -h 2. As a python-markdown extension:: import markdown md = markdown.Markdown(None, extensions=['latex']) # text is input string ... latex_out = md.convert(text) 3. Directly as a module (slight inversion of std markdown extension setup):: import markdown import mdx_latex md = markdown.Markdown() latex_mdx = mdx_latex.LaTeXExtension() latex_mdx.extendMarkdown(md, markdown.__dict__) out = md.convert(text) History ======= Version: 1.0 (November 15, 2006) * First working version (compatible with markdown 1.5) * Includes support for tables Version: 1.1 (January 17, 2007) * Support for verbatim and images Version: 1.2 (June 2008) * Refactor as an extension. * Make into a proper python/setuptools package. * Tested with markdown 1.7 but should work with 1.6 and (possibly) 1.5 (though pre/post processor stuff not as worked out there) ''' __version__ = '1.2' # do some fancy importing stuff to allow use to override things in this module # in this file while still importing * for use in our own classes import re import sys import markdown start_single_quote_re = re.compile("""(^|\s|")'""") start_double_quote_re = re.compile('''(^|\s|'|`)"''') end_double_quote_re = re.compile('"(,|\.|\s|$)') def unescape_html_entities(text): out = text.replace('&', '&') out = out.replace('<', '<') out = out.replace('>', '<') out = out.replace('"', '"') return out def escape_latex_entities(text): """Escape latex reserved characters.""" out = text out = unescape_html_entities(out) out = out.replace('%', '\\%') out = out.replace('&', '\\&') out = out.replace('#', '\\#') out = start_single_quote_re.sub('\g<1>`', out) out = start_double_quote_re.sub('\g<1>``', out) out = end_double_quote_re.sub("''\g<1>", out) # people should escape these themselves as it conflicts with maths # out = out.replace('{', '\\{') # out = out.replace('}', '\\}') # do not do '$' here because it is dealt with by convert_maths # out = out.replace('$', '\\$') return out def unescape_latex_entities(text): """Limit ourselves as this is only used for maths stuff.""" out = text out = out.replace('\\&', '&') return out def makeExtension(configs=None): return LaTeXExtension(configs=configs) class LaTeXExtension(markdown.Extension): def __init__ (self, configs=None): self.reset() def extendMarkdown(self, md, md_globals): self.md = md # remove escape pattern -- \\(.*) -- as this messes up any embedded # math and we don't need to escape stuff any more for html for pat in self.md.inlinePatterns: if pat.pattern == markdown.ESCAPE_RE: idx = self.md.inlinePatterns.index(pat) del self.md.inlinePatterns[idx] break # Insert a post-processor that would actually add the footnote div postprocessor = LaTeXPostProcessor() md.postprocessors.append(postprocessor) math_pp = MathTextPostProcessor() table_pp = TableTextPostProcessor() image_pp = ImageTextPostProcessor() unescape_html_pp = UnescapeHtmlTextPostProcessor() md.textPostprocessors.append(math_pp) md.textPostprocessors.append(table_pp) md.textPostprocessors.append(image_pp) # run last md.textPostprocessors.append(unescape_html_pp) footnote_extension = FootnoteExtension() footnote_extension.extendMarkdown(md, md_globals) def reset(self) : pass class LaTeXPostProcessor(markdown.Postprocessor): def run(self, doc): '''Walk the dom converting relevant nodes to text nodes with relevant content.''' latex_text = self.tolatex(doc.documentElement) # attach latex text as only element # have to put it in a p tag as text node for document element does not # work ... # # with stripTopLevelTags True (default) convert strips out first 23 and # last 7 chars # (this is the extra stuff added in by Markdown._transform ...) # = 6, = 7 latex_text = 'X' * 17 + latex_text # do not use p or li as they result in indentation latex_node = doc.createElement('span', latex_text) doc.appendChild(latex_node) def tolatex(self, ournode): buffer = "" subcontent = "" if ournode.type == 'text': text = escape_latex_entities(ournode.value) return text if ournode.childNodes or ournode.nodeName in ['blockquote']: for child in ournode.childNodes : subcontent += self.tolatex(child) if ournode.nodeName == 'h1': buffer += '\n\\title{%s}\n' % subcontent buffer += ''' % ---------------------------------------------------------------- \maketitle % ---------------------------------------------------------------- ''' elif ournode.nodeName == 'h2': buffer += '\n\n\\section{%s}\n' % subcontent elif ournode.nodeName == 'h3': buffer += '\n\n\\subsection{%s}\n' % subcontent elif ournode.nodeName == 'h4': buffer += '\n\\subsubsection{%s}\n' % subcontent elif ournode.nodeName == 'ul': # no need for leading \n as one will be provided by li buffer += ''' \\begin{itemize}%s \\end{itemize} ''' % subcontent elif ournode.nodeName == 'ol': # no need for leading \n as one will be provided by li buffer += ''' \\begin{enumerate}%s \\end{enumerate} ''' % subcontent elif ournode.nodeName == 'li': buffer += ''' \\item %s''' % subcontent.strip() elif ournode.nodeName == 'blockquote': # use quotation rather than quote as quotation can support multiple # paragraphs buffer += ''' \\begin{quotation} %s \\end{quotation} ''' % subcontent.strip() # ignore 'code' when inside pre tags # (mkdn produces
) elif (ournode.nodeName == 'pre' or (ournode.nodeName == 'pre' and ournode.parentNode.nodeName != 'pre')): buffer += ''' \\begin{verbatim} %s \\end{verbatim} ''' % subcontent.strip() elif ournode.nodeName == 'q': buffer += "`%s'" % subcontent.strip() elif ournode.nodeName == 'p': buffer += '\n%s\n' % subcontent.strip() # Footnote processor inserts all of the footnote in a sup tag elif ournode.nodeName == 'sup': buffer += '\\footnote{%s}' % subcontent.strip() elif ournode.nodeName == 'strong': buffer += '\\textbf{%s}' % subcontent.strip() elif ournode.nodeName == 'em': buffer += '\\emph{%s}' % subcontent.strip() else: buffer = subcontent return buffer class UnescapeHtmlTextPostProcessor(markdown.TextPostprocessor): def run(self, text): return unescape_html_entities(text) # ========================= MATHS ================================= class MathTextPostProcessor(markdown.TextPostprocessor): def run(self, instr): """Convert all math sections in {text} whether latex, asciimathml or latexmathml formatted to latex. This assumes you are using $$ as your mathematics delimiter (*not* the standard asciimathml or latexmathml delimiter). """ def repl_1(matchobj): text = unescape_latex_entities(matchobj.group(1)) tmp = text.strip() if tmp.startswith('\\[') or tmp.startswith('\\begin'): return text else: return '\\[%s\\]\n' % text def repl_2(matchobj): text = unescape_latex_entities(matchobj.group(1)) return '$%s$' % text # $$ ..... $$ pat = re.compile('^\$\$([^\$]*)\$\$\s*$', re.MULTILINE) out = pat.sub(repl_1, instr) # $100 million pat2 = re.compile('([^\$])\$([^\$])') out = pat2.sub('\g<1>\\$\g<2>', out) # Jones, $$x=3$$, is ... pat3 = re.compile('\$\$([^\$]*)\$\$') out = pat3.sub(repl_2, out) # some extras due to asciimathml out = out.replace('\\lt', '<') out = out.replace(' * ', ' \\cdot ') out = out.replace('\\del', '\\partial') return out # ========================= TABLES ================================= class TableTextPostProcessor(markdown.TextPostprocessor): def run(self, instr): """This is not very sophisticated and for it to work it is expected that: 1. tables to be in a section on their own (that is at least one blank line above and below) 2. no nesting of tables """ converter = Table2Latex() new_blocks = [] for block in instr.split("\n\n") : stripped = block.strip() # if stripped.startswith(''): latex_table = converter.convert(stripped).strip() new_blocks.append(latex_table) else : new_blocks.append(block) return '\n\n'.join(new_blocks) import xml.dom.minidom class Table2Latex: """ Convert html tables to Latex. TODO: escape latex entities. """ def colformat(self): # centre align everything by default out = '|c' * self.maxcols + '|' return out def get_text(self, element): if element.nodeType == element.TEXT_NODE: return escape_latex_entities(element.data) result = '' if element.childNodes: for child in element.childNodes : text = self.get_text(child) if text.strip() != '': result += text return result def process_cell(self, element): # works on both td and th colspan = 1 subcontent = self.get_text(element) buffer = '' if element.tagName == 'th': subcontent = '\\textbf{%s}' % subcontent if element.hasAttribute('colspan'): colspan = int(element.getAttribute('colspan')) buffer += ' \multicolumn{%s}{|c|}{%s}' % (colspan, subcontent) # we don't support rowspan because: # 1. it needs an extra latex package \usepackage{multirow} # 2. it requires us to mess around with the alignment tags in # subsequent rows (i.e. suppose the first col in row A is rowspan 2 # then in row B in the latex we will need a leading &) # if element.hasAttribute('rowspan'): # rowspan = int(element.getAttribute('rowspan')) # buffer += ' \multirow{%s}{|c|}{%s}' % (rowspan, subcontent) else: buffer += ' %s' % subcontent notLast = ( element.nextSibling and element.nextSibling.nodeType == element.ELEMENT_NODE and element.nextSibling.tagName in [ 'td', 'th' ]) if notLast: buffer += ' &' self.numcols += colspan return buffer def tolatex(self, element): if element.nodeType == element.TEXT_NODE: return '' buffer = '' subcontent = '' if element.childNodes: for child in element.childNodes : text = self.tolatex(child) if text.strip() != '': subcontent += text subcontent = subcontent.strip() if element.tagName == 'thead': buffer += '''%s ''' % subcontent elif element.tagName == 'tr': self.maxcols = max(self.numcols, self.maxcols) self.numcols = 0 buffer += '\n\\hline\n%s \\\\' % subcontent elif element.tagName == 'td' or element.tagName == 'th': buffer = self.process_cell(element) else: # print '"%s"' % subcontent buffer += subcontent return buffer def convert(self, instr): self.numcols = 0 self.maxcols = 0 dom = xml.dom.minidom.parseString(instr) core = self.tolatex(dom.documentElement) captionElements = dom.documentElement.getElementsByTagName('caption') caption = '' if captionElements: caption = self.get_text(captionElements[0]) colformatting = self.colformat() table_latex = \ ''' \\begin{table} \\begin{tabular}{%s} %s \\hline \\end{tabular} \\\\[5pt] \\caption{%s} \\end{table} ''' % (colformatting, core, caption) return table_latex # ========================= IMAGES ================================= class ImageTextPostProcessor(markdown.TextPostprocessor): def run(self, instr): """Process all img tags Similar to process_tables this is not very sophisticated and for it to work it is expected that img tags are put in a section of their own (that is separated by at least one blank line above and below). """ converter = Img2Latex() new_blocks = [] for block in instr.split("\n\n") : stripped = block.strip() #
if stripped.startswith(' as opposed to putting a reference link). ''' class FootnoteExtension (markdown.Extension): DEF_RE = re.compile(r'(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)') SHORT_USE_RE = re.compile(r'\[\^([^\]]*)\]', re.M) # [^a] def __init__ (self, configs=None): self.reset() def extendMarkdown(self, md, md_globals): self.md = md # Stateless extensions do not need to be registered md.registerExtension(self) # Insert a preprocessor before ReferencePreprocessor index = md.preprocessors.index(md_globals['REFERENCE_PREPROCESSOR']) preprocessor = FootnotePreprocessor(self) preprocessor.md = md md.preprocessors.insert(index, preprocessor) # Insert an inline pattern before ImageReferencePattern FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah index = md.inlinePatterns.index(md_globals['IMAGE_REFERENCE_PATTERN']) md.inlinePatterns.insert(index, FootnotePattern(FOOTNOTE_RE, self)) def reset(self) : self.used_footnotes={} self.footnotes = {} def setFootnote(self, id, text) : self.footnotes[id] = text class FootnotePreprocessor : def __init__ (self, footnotes) : self.footnotes = footnotes def run(self, lines) : self.blockGuru = markdown.BlockGuru() lines = self._handleFootnoteDefinitions (lines) # Make a hash of all footnote marks in the text so that we # know in what order they are supposed to appear. (This # function call doesn't really substitute anything - it's just # a way to get a callback for each occurence. text = "\n".join(lines) self.footnotes.SHORT_USE_RE.sub(self.recordFootnoteUse, text) return text.split("\n") def recordFootnoteUse(self, match) : id = match.group(1) id = id.strip() nextNum = len(self.footnotes.used_footnotes.keys()) + 1 self.footnotes.used_footnotes[id] = nextNum def _handleFootnoteDefinitions(self, lines) : """Recursively finds all footnote definitions in the lines. @param lines: a list of lines of text @returns: a string representing the text with footnote definitions removed """ i, id, footnote = self._findFootnoteDefinition(lines) if id : plain = lines[:i] detabbed, theRest = self.blockGuru.detectTabbed(lines[i+1:]) self.footnotes.setFootnote(id, footnote + "\n" + "\n".join(detabbed)) more_plain = self._handleFootnoteDefinitions(theRest) return plain + [""] + more_plain else : return lines def _findFootnoteDefinition(self, lines) : """Finds the first line of a footnote definition. @param lines: a list of lines of text @returns: the index of the line containing a footnote definition """ counter = 0 for line in lines : m = self.footnotes.DEF_RE.match(line) if m : return counter, m.group(2), m.group(3) counter += 1 return counter, None, None class FootnotePattern(markdown.Pattern): def __init__ (self, pattern, footnotes) : markdown.Pattern.__init__(self, pattern) self.footnotes = footnotes def handleMatch(self, m, doc) : sup = doc.createElement('sup') id = m.group(2) # stick the footnote text in the sup self.footnotes.md._processSection(sup, self.footnotes.footnotes[id].split("\n")) return sup def template(template_fo, latex_to_insert): tmpl = template_fo.read() tmpl = tmpl.replace('INSERT-TEXT-HERE', latex_to_insert) return tmpl # title_items = [ '\\title', '\\end{abstract}', '\\thanks', '\\author' ] # has_title_stuff = False # for it in title_items: # has_title_stuff = has_title_stuff or (it in tmpl) def main(): import optparse usage = \ '''usage: %prog [options] Given a file path, process it using markdown2latex and print the result on stdout. If using template option template should place text INSERT-TEXT-HERE in the template where text should be inserted. ''' parser = optparse.OptionParser(usage) parser.add_option('-t', '--template', dest='template', default='', help='path to latex template file (optional)', default='') (options, args) = parser.parse_args() if not len(args) > 0: parser.print_help() sys.exit(1) inpath = args[0] infile = file(inpath) md = markdown.Markdown() mkdn2latex = LaTeXExtension() mkdn2latex.extendMarkdown(md, markdown.__dict__) out = md.convert(infile.read()) if options.template: tmpl_fo = file(options.template) out = template(tmpl_fo, out) print out