#!/usr/bin/python
"""
NAME
====

texi2rest - Convert texinfo xml representation to reStructuredText

SYNOPSIS
========

texi2rest *xmlfile* > *restfile*

DESCRIPTION
===========

``texi2rest`` is based on ``xhtml2rest`` by Antonios Christofides.  He
included the following disclaimer for his program, and it applies equally to
mine: far from being a decent and complete program, this is only something to
begin with, which hopefully processes the given UTF-8 texinfo xml file and
produces reStructuredText "source code" in the standard output.

Before processing the texinfo file, you must convert it to xml using the
makeinfo command:

    makeinfo --xml *texifile*
    texi2rest *xmlfile* > *restfile*

LIMITATIONS
===========

I am writing this specifically to convert the XaoS project's documentation.
I do not plan to implement full conversion of every texinfo directive--only
the ones used in the documentation I am trying to convert.  Hopefully other
interested parties will contribute further improvements.

META
====

``texi2rest`` was hacked together by J.B. Langston,
jb-langston@austin.rr.com, based on ``xhtml2rest`` by Antonios Christofides,
anthony@itia.ntua.gr.

Revision: $Revision: 3753 $

The code and this text is hereby placed in the public domain.
"""

import xml.dom.minidom
import re
import sys
import textwrap
import math
import warnings
import codecs

try:
    import UserList                       # Python 2 module
    _UserListBase = UserList.UserList
except ImportError:                       # Python 3: the class moved
    import collections
    _UserListBase = collections.UserList

###############################################################################
# Configuration: these values change the behavior of the conversion

# Texinfo commands that generate emphasis markup (i.e., *text*)
EMPHASIS_COMMANDS = ('emph', 'i', 'slanted', 'var')

# Texinfo commands that generate strong markup (i.e., **text**)
STRONG_COMMANDS = ('strong', 'b')

# Texinfo commands that generate literal markup (i.e., ``text``)
# (The comma between 'verb' and 'tt' matters: without it Python concatenates
# the two adjacent literals into the single bogus entry 'verbtt'.)
LITERAL_COMMANDS = ('code', 'verb', 'tt')

# Texinfo commands that map to reST roles of the same name
# (i.e., :role:`text`)
VERBATIM_COMMANDS = ('dfn', 'file', 'command', 'option', 'kbd', 'samp', 'math')

# Texinfo commands that map to different reStructuredText roles
# (i.e., :role:`text`)
MAPPED_COMMANDS = {
    'env': 'envvar',
    'key': 'kbd',
    'cite': 'title',
}

# Texinfo commands that do not generate any markup, but preserve nested text
IGNORED_COMMANDS = ('url', 'sc', 'r', 'sansserif', 'titlefont', 'dmn', 'logo',
                    'punct')

# Texinfo commands that are deleted from output, including nested text
DELETED_COMMANDS = ()

# Map of Texinfo section commands to section underline/overline characters.
# Single character indicates underline only; double characters indicates
# overline+underline.
SECTION_COMMANDS = {
    # level 1
    'top': '**',
    'chapter': '**',
    'unnumbered': '**',
    'appendix': '**',
    # level 2
    'section': '=',
    'unnumberedsec': '=',
    'appendixsec': '=',
    # level 3
    'subsection': '-',
    'unnumberedsubsec': '-',
    'appendixsubsec': '-',
    # level 4
    'subsubsection': '^',
    'unnumberedsubsubsec': '^',
    'appendixsubsubsec': '^',
}

###############################################################################

###############################################################################
# Global variables. I know. I'm terribly sorry. Please get rid of them.

# 'unindent' is used by list items.  A list item is always indented, but its
# first line is "unindented" and contains the number or bullet.  However, it
# was difficult for the list item node to tell its #text contents (which may
# be deeply nested) to use that.  So it just places the number or bullet in
# "unindent".  The first text to be rendered uses the unindent and then sets
# it to empty again.
unindent = ''

hyperlinks = {}  # text-target pairs found in "a href" elements

###############################################################################


class Ditem(object):
    """A document item; usually a node, but can be a block of text
    resulting from processing adjacent inline items. If it is a node,
    it is usually the BlockDitem subclass; if it is text, it is
    normally a plain Ditem."""

    def __init__(self, text):
        self.text = text       # Contained text (empty for BlockDitem)
        self.type = ''         # tag for block node, empty for inline
        self.indentlevel = 0   # 0 - unindented; 1 - indented; etc.

    def __repr__(self):
        return self.__class__.__name__ + '("""' + self.text + '""")'

    def propagate_indents(self):
        "Propagates indent level recursively to children"
        pass

    def maxwidth(self):
        "Width it will occupy if allowed to render on infinite width"
        self.remove_white_space()
        return len(self.text) + 4 * self.indentlevel

    def minwidth(self):
        "Width it will occupy if wrapped as much as possible"
        wordlens = [len(word) for word in self.text.split()]
        if wordlens:
            return max(wordlens) + 4 * self.indentlevel
        return 0

    def format(self, width):
        """Returns contents formatted so as not to exceed specified
        width, if possible.

        Consumes the module-level ``unindent`` (the pending list bullet or
        number), placing it at the start of the first output line, and then
        resets it to the empty string.
        """
        global unindent
        if self.type == 'pre':
            raise Exception("What are we doing here?")
        self.remove_white_space()
        # Quick hack to fix a problem. Do we begin with '* '?
        while (len(self.text) >= 2 and self.text[1] == ' '
               and self.text[0] in '*-'):
            # It may be mistaken for a bullet list. Strip it.
            self.text = self.text[2:]
        width = max(width, self.minwidth())
        # The textwrap module has the nasty habit of breaking at hyphens. So
        # we'll do a nasty hack: find a character that does not exist in the
        # text, replace all hyphens with that character, wrap, and then put
        # the hyphens back.
        hyphensurrogate = ''
        for candidate in '!@#$%^&*~':
            if candidate not in self.text:
                hyphensurrogate = candidate
                break
        if not hyphensurrogate:
            raise Exception("Houston we have a problem")
        text = self.text.replace('-', hyphensurrogate)
        body_indent = 4 * self.indentlevel
        wrapper = textwrap.TextWrapper(
            initial_indent=(body_indent - len(unindent)) * ' ' + unindent,
            subsequent_indent=body_indent * ' ',
            width=width,
            break_long_words=False)
        unindent = ''
        text = wrapper.fill(text)
        return text.replace(hyphensurrogate, '-')

    def empty(self):
        "Returns true if contains nothing"
        return not self.text

    def remove_white_space(self):
        "Removes extra white space"
        self.text = re.sub(r'\s+', ' ', self.text).strip()

    def canmerge(self):
        "Tells whether it's possible to merge this Ditem with adjacent ones"
        return True

    def merge(self, aditem):
        """If possible, merges aditem, which should be an adjacent Ditem that
        comes after this one.  Returns True when the merge happened."""
        if not self.canmerge() or not aditem.canmerge():
            return False
        if len(self.text) > 0 and self.text[-1] == '_' \
                and len(aditem.text) > 0 \
                and aditem.text[0] not in """ \n\t:.,!=/|;"'?<>[]{}()""":
            # Leave space after link if not followed by punctuation
            self.text = self.text + ' ' + aditem.text
        else:
            self.text = self.text + aditem.text
        return True


class BlockDitem(Ditem):
    "A Ditem which contains other Ditems"

    def __init__(self, type):
        Ditem.__init__(self, '')
        self.type = type
        self.children = []  # Contained Ditems

    def __repr__(self):
        return (self.__class__.__name__ + '("' + self.type + '"); children = '
                + repr(self.children))

    def maxwidth(self):
        "Largest maxwidth among the children, or 0 without children"
        childmaxwidths = [x.maxwidth() for x in self.children]
        return max(childmaxwidths) if childmaxwidths else 0

    def minwidth(self):
        "Largest minwidth among the children, or 0 without children"
        childminwidths = [x.minwidth() for x in self.children]
        return max(childminwidths) if childminwidths else 0

    def propagate_indents(self):
        for x in self.children:
            x.indentlevel = self.indentlevel
            x.propagate_indents()

    def format(self, width):
        "Formats each child and joins the non-empty results by blank lines"
        width = max(width, self.minwidth())
        results = [x.format(width) for x in self.children]
        return "\n\n".join([x for x in results if x])

    def empty(self):
        return not self.children

    def canmerge(self):
        return False


class PreDitem(Ditem):
    "A Ditem representing a literal block"

    def maxwidth(self):
        return max([len(x) for x in self.text.split('\n')])

    def minwidth(self):
        return self.maxwidth()  # Literal block; width's given

    def remove_white_space(self):
        pass

    def format(self, width):
        "Emits the text as a '::' literal block; width is ignored"
        body = ''.join([' ' + x + '\n' for x in self.text.split('\n')])
        return '::\n\n' + body + '..\n\n'

    def canmerge(self):
        return False


class HeadingDitem(BlockDitem):
    "A Ditem representing a section heading"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def minwidth(self):
        return self.maxwidth()  # Headings don't wrap

    def format(self, width):
        """Returns the heading text with its underline (and overline, for
        two-character SECTION_COMMANDS entries).

        Raises KeyError if the heading type is not in SECTION_COMMANDS.
        """
        assert len(self.children) == 1
        text = self.children[0].format(32767)
        # Use the shared configuration table so that every section command
        # configured there (including 'top') can be rendered.
        underliner = SECTION_COMMANDS[self.type]
        rule = len(text) * underliner[0]
        if len(underliner) == 2:
            return rule + '\n' + text + '\n' + rule
        return text + '\n' + rule


class BlockQuoteDitem(BlockDitem):
    "A Ditem representing a blockquote"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def propagate_indents(self):
        self.indentlevel = self.indentlevel + 1
        BlockDitem.propagate_indents(self)


class ListDitem(BlockDitem):
    "A Ditem representing a list"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        # First pass the list type and order to the children
        order = 1
        for x in self.children:
            if isinstance(x, ListItemDitem):
                x.listtype = self.type
                x.order = order
                order = order + 1
        # And then process normally
        return BlockDitem.format(self, width)


class ListItemDitem(BlockDitem):
    "A Ditem representing a list item, definition term or definition item"

    def __init__(self, type):
        BlockDitem.__init__(self, type)
        self.listtype = None  # type of the enclosing list; set by ListDitem
        self.order = 0        # 1-based position in the list; set by ListDitem

    def minwidth(self):
        if self.type == 'definitionterm':
            return self.maxwidth()  # Don't wrap definition terms
        return BlockDitem.minwidth(self)

    def propagate_indents(self):
        if self.type in ('item', 'definitionitem'):
            self.indentlevel = self.indentlevel + 1
        BlockDitem.propagate_indents(self)

    def format(self, width):
        "Stores the bullet/number in the global 'unindent', then formats"
        global unindent
        if self.type == 'item' and self.listtype == 'enumerate':
            unindent = ('%d. ' % (self.order)).ljust(4)
        elif self.type == 'item' and self.listtype == 'itemize':
            unindent = '* '
        return BlockDitem.format(self, width)


class RenderedColumn(object):
    "Width information about a column being rendered"

    def __init__(self, minwidth, maxwidth):
        self.minwidth = minwidth
        self.maxwidth = maxwidth
        self.curwidth = maxwidth
        self.fixedwidth = 0

    def logwidth(self):
        "Natural logarithm of maxwidth (0 for a zero-width column)"
        if self.maxwidth == 0:
            return 0
        return math.log(self.maxwidth)

    def update(self, minwidth, maxwidth):
        "Replaces minwidth/maxwidth if greater"
        self.minwidth = max(minwidth, self.minwidth)
        self.maxwidth = max(maxwidth, self.maxwidth)
        self.curwidth = self.maxwidth


class RenderedColumns(_UserListBase):
    "A list of RenderedColumn"

    def __init__(self, alist):
        self.data = alist

    def totalWidth(self):
        "Returns total table width"
        # One separator per column plus the closing one.
        return sum([z.curwidth for z in self.data]) + len(self.data) + 1

    def sumLogWidth(self):
        "Returns sum of logwidth for nonfixed columns"
        return sum([x.logwidth() * (1 - x.fixedwidth) for x in self.data])

    def _shrinkage(self, column, excess):
        "Share of 'excess' that 'column' should give up, by log-width weight"
        total = self.sumLogWidth()
        if not total:
            # All remaining columns have zero log-width; nothing can be
            # distributed, and dividing would raise ZeroDivisionError.
            return 0
        return math.ceil(excess * column.logwidth() / total)

    def distributeWidthDifference(self, width):
        "Step 4 of w3m table rendering algorithm"
        # Note: The use of math.ceil is because I'd rather have a suboptimal
        # width (a few characters less than requested width) rather than go
        # find what to do with rounding.
        w = self.totalWidth() - width
        assert w > 0
        repeat_distribution = True
        while repeat_distribution:
            repeat_distribution = False
            for x in self.data:
                if x.fixedwidth:
                    continue
                if x.curwidth - self._shrinkage(x, w) < x.minwidth:
                    # This column cannot shrink as far as its share demands:
                    # pin it to its minimum and start over with the rest.
                    x.curwidth = x.minwidth
                    x.fixedwidth = 1
                    w = self.totalWidth() - width
                    repeat_distribution = True
                    break
        # Now that we finished finding which columns need to be fixed to
        # their minimum width, perform the distribution once again, without
        # checking, and actually change remaining column widths
        for x in self.data:
            if x.fixedwidth:
                continue
            x.curwidth = x.curwidth - self._shrinkage(x, w)


def tablehrule(colwidths, rule='-'):
    "Returns a horizontal table separator for given column widths"
    return '+' + ''.join([rule * x + '+' for x in colwidths])


class TableDitem(BlockDitem):
    "A Ditem representing a table; its children are TrDitem rows"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def format(self, width):
        """Returns the table rendered as a reST grid table ('' if the table
        has no rows)."""
        # Uses table rendering algorithm of w3m
        # (http://www.w3m.org/story.html), but ignoring width attribute
        if not self.children:
            return ''
        # Step 1
        columns = RenderedColumns(
            [RenderedColumn(x.minwidth(),
                            max(x.maxwidth(), 1)  # A column can't be smaller
                                                  # than 1 character
                            )
             for x in self.children[0].children])
        for row in self.children:
            for i in range(len(columns)):
                if len(row.children) <= i:
                    continue  # Skip empty columns
                columns[i].update(row.children[i].minwidth(),
                                  row.children[i].maxwidth())
        # Step 2 (width attribute) ignored
        # Step 3 (already done - list was created with maxwidth)
        # Step 4
        if columns.totalWidth() > width:
            columns.distributeWidthDifference(width)
        # OK, column widths are now calculated
        colwidths = [int(x.curwidth) for x in columns]
        result = tablehrule(colwidths) + '\n'
        usedheadbodysep = False
        for tr in self.children:
            result = result + tr.format(colwidths)
            rule = '-'
            # The first header row that is not also the last row gets the
            # '=' head/body separator.
            if not usedheadbodysep and tr.children \
                    and tr.children[0].type == 'th' \
                    and tr is not self.children[-1]:
                rule = '='
                usedheadbodysep = True
            result = result + tablehrule(colwidths, rule) + '\n'
        return result


class TrDitem(BlockDitem):
    "A Ditem representing a table row; its children are the cells"

    def __init__(self, type):
        BlockDitem.__init__(self, type)

    def maxwidth(self):
        return sum([x.maxwidth() for x in self.children]) \
            + len(self.children) + 1

    def minwidth(self):
        return sum([x.minwidth() for x in self.children]) \
            + len(self.children) + 1

    def format(self, colwidths):
        """Returns the row rendered with '|' separators.

        Unlike the other format() methods, the argument is the list of
        column widths rather than a single width.
        """
        columns = []      # List of lists of lines
        maxlinecount = 0  # Num of lines in vertically largest column
        for i, colwidth in enumerate(colwidths):
            if len(self.children) <= i:
                lines = ['']
            else:
                lines = self.children[i].format(colwidth).split('\n')
            lines = [x.ljust(colwidth) for x in lines]  # Pad to col len
            maxlinecount = max(maxlinecount, len(lines))
            columns.append(lines)
        # Pad vertically
        for i, lines in enumerate(columns):
            lines.extend([' ' * colwidths[i]] * (maxlinecount - len(lines)))
        # Add vertical separators
        result = ''
        for i in range(maxlinecount):
            result = result + '|' \
                + ''.join([lines[i] + '|' for lines in columns]) + '\n'
        return result


def handleNodeList(nodelist):
    "Processes given nodes; merges them if possible; returns ditem list"
    ditems = []
    curditem = Ditem('')
    for node in nodelist:
        aditem = handleNode(node)
        if curditem.merge(aditem):
            continue
        ditems.append(curditem)
        curditem = aditem
    if not curditem.empty():
        ditems.append(curditem)
    return ditems


def handleNode(node):
    """Converts a DOM node into a Ditem, dispatching on the node name.

    Nodes that match none of the configured command tables are processed
    generically: their children are converted and, if more than one ditem
    results, wrapped in a BlockDitem named after the node.
    """
    if node.nodeType == node.TEXT_NODE:
        return handleText(node)
    name = node.nodeName
    if name in EMPHASIS_COMMANDS:
        return handleEmphasis(node)
    if name in STRONG_COMMANDS:
        return handleStrong(node)
    if name in LITERAL_COMMANDS:
        return handleLiteral(node)
    if name in VERBATIM_COMMANDS:
        return handleVerbatimCommand(node)
    if name in MAPPED_COMMANDS:
        return handleMappedCommand(node)
    if name in IGNORED_COMMANDS:
        return handleIgnoredCommand(node)
    if name in DELETED_COMMANDS:
        return handleDeletedCommand(node)
    if node.hasChildNodes():
        contents = handleNodeList(node.childNodes)
        if len(contents) == 1:
            return contents[0]
        if len(contents) == 0:
            return Ditem('')
        result = BlockDitem(name)
        result.children = contents
        return result
    return Ditem('')


def processChildren(node):
    "Returns the ditems for the node's children (empty if it has none)"
    if node.hasChildNodes():
        return handleNodeList(node.childNodes)
    return ()


def mergeChildren(node):
    """Returns the single ditem the node's children merge into.

    Raises Exception if the children produce more than one ditem.
    """
    contents = processChildren(node)
    if len(contents) > 1:
        raise Exception('Unexpected block elements')
    if contents:
        return contents[0]
    return Ditem('')


def handleEmphasis(node):
    "Wraps the node's text in *emphasis* markup"
    result = mergeChildren(node)
    result.type = node.nodeName
    if result.text:
        result.text = '*' + result.text + '*'
    return result


def handleStrong(node):
    "Wraps the node's text in **strong** markup"
    result = mergeChildren(node)
    result.type = node.nodeName
    if result.text:
        result.text = '**' + result.text + '**'
    return result


def handleLiteral(node):
    "Wraps the node's text in ``literal`` markup"
    result = mergeChildren(node)
    result.type = node.nodeName
    if result.text:
        result.text = '``' + result.text + '``'
    return result


def handleVerbatimCommand(node):
    "Renders the node as a reST role named after the node itself"
    result = mergeChildren(node)
    result.type = node.nodeName
    if result.text:
        result.text = ':' + node.nodeName + ':`' + result.text + '`'
    return result


def handleMappedCommand(node):
    "Renders the node as the reST role given by MAPPED_COMMANDS"
    result = mergeChildren(node)
    result.type = node.nodeName
    if result.text:
        result.text = (':' + MAPPED_COMMANDS[node.nodeName] + ':`'
                       + result.text + '`')
    return result


def handleIgnoredCommand(node):
    "Keeps the node's text but adds no markup"
    result = mergeChildren(node)
    result.type = node.nodeName
    return result


def handleDeletedCommand(node):
    "Drops the node, including its nested text"
    # Must be a Ditem (not a bare string) so that the caller can merge it and
    # so that the type attribute can be set.
    result = Ditem('')
    result.type = node.nodeName
    return result


def handleText(node):
    return Ditem(node.data)


def handleAnchor(node):
    "Renders the node as a :ref: role pointing at its 'href' attribute"
    result = mergeChildren(node)
    result.type = node.nodeName
    result.text = result.text.strip()
    if result.text == '':
        return result
    target = node.getAttribute('href').strip()
    result.text = re.sub(r'\s+', ' ', result.text)
    result.text = ':ref:`' + result.text + ' <' + target + '>`'
    return result


def handleHeading(node):
    "Builds a HeadingDitem whose level comes from the node's parent name"
    contents = mergeChildren(node)
    if contents.empty():
        return contents
    result = HeadingDitem(node.parentNode.nodeName)
    result.children.append(contents)
    return result


def handleGenericBlock(node):
    result = BlockDitem(node.nodeName)
    result.children = processChildren(node)
    return result


def handleBlockQuote(node):
    result = BlockQuoteDitem(node.nodeName)
    result.children = processChildren(node)
    return result


def handleList(node):
    result = ListDitem(node.nodeName)
    result.children = processChildren(node)
    return result


def handleListItem(node):
    result = ListItemDitem(node.nodeName)
    result.children = processChildren(node)
    return result


def handleTable(node):
    result = TableDitem(node.nodeName)
    # Ignore table contents that are not tr
    result.children = [x for x in processChildren(node) if x.type == 'tr']
    return result


def handleTr(node):
    result = TrDitem(node.nodeName)
    # Ignore tr contents that are not th or td
    result.children = [x for x in processChildren(node)
                       if x.type in ('th', 'td')]
    return result


def handlePre(node):
    return PreDitem(mergeChildren(node).text)


def main(argv=None):
    """Converts the texinfo xml file named by argv[1] and writes the
    reStructuredText to standard output as UTF-8.

    Returns the process exit status (0 on success, 2 on a usage error).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('usage: texi2rest xmlfile > restfile\n')
        return 2
    dom1 = xml.dom.minidom.parse(argv[1])
    ditem = handleNode(dom1.getElementsByTagName("texinfo")[0])
    ditem.propagate_indents()
    # The codec writer emits encoded bytes, so on Python 3 it has to wrap the
    # underlying binary stream; Python 2's sys.stdout has no 'buffer'.
    outf = codecs.getwriter('utf-8')(getattr(sys.stdout, 'buffer', sys.stdout))
    outf.write(ditem.format(79) + '\n')
    for h in hyperlinks:
        outf.write('\n.. _`' + h + '`:\n ' + hyperlinks[h] + '\n')
    return 0


if __name__ == '__main__':
    sys.exit(main())