1 files changed, 119 insertions, 0 deletions
diff --git a/i18n/html2po.py b/i18n/html2po.py
new file mode 100755
index 0000000..9d1b2ef
--- /dev/null
+++ b/i18n/html2po.py
@@ -0,0 +1,119 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# Copyright Peter Gijsels, 2010
+# under the MIT license http://www.opensource.org/licenses/mit-license.php
+
+import sys
+sys.path.append(sys.path[0] + '/BeautifulSoup-3.0.8')
+
+import re
+import time
+import BeautifulSoup
+
+def tag_text(tag):
+    """Return the str() of every item in tag.contents, joined together.
+
+    Only the items held in tag.contents contribute to the result; nothing
+    is added for the tag object itself.
+    """
+    return ''.join(map(str, tag.contents))
+
+
+# Adapted from pygettext.py in the standard Python distribution
+
+# Template for the header entry written at the top of the generated .po
+# file.  write() fills in the two %-style fields, 'time' and 'version'; the
+# upper-case words (PACKAGE VERSION, CHARSET, ...) are emitted verbatim.
+# The backslashes are doubled so that the two literal characters \n, not a
+# newline, end up inside each quoted header line.
+pot_header = '''\
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR ORGANIZATION
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\\n"
+"POT-Creation-Date: %(time)s\\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
+"Language-Team: LANGUAGE <LL@li.org>\\n"
+"MIME-Version: 1.0\\n"
+"Content-Type: text/plain; charset=CHARSET\\n"
+"Content-Transfer-Encoding: ENCODING\\n"
+"Generated-By: html2po.py %(version)s\\n"
+
+'''
+
+# Version string substituted into the "Generated-By" line of the .po header.
+__version__ = '1.0'
+# Separator used by escape() when joining escaped characters back together.
+EMPTYSTRING = ''
+
+# Escape table indexed by character code; filled in by make_escapes().
+escapes = []
+
+def make_escapes(pass_iso8859):
+    """Build the module-level escapes table, one entry per character code.
+
+    After the call, escapes[code] holds the text to emit for the character
+    with that code (0..255): the character itself, a C-style short escape,
+    or a backslash followed by three octal digits.
+
+    pass_iso8859 -- if true, codes 160..254 pass through unescaped in
+        addition to the printable ASCII range 32..126.  If false, every
+        code outside 32..126 is escaped.
+
+    The table is rebuilt from scratch on every call, so calling this again
+    with a different flag replaces the previous table instead of appending
+    a second set of 256 entries that no lookup would ever reach.
+    """
+    global escapes
+    if pass_iso8859:
+        # Allow iso-8859 characters to pass through so that e.g. 'msgid
+        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
+        # escape any character outside the 32..126 range.
+        # Reducing the code modulo 128 folds the upper half of the table
+        # onto the lower half, so one range test covers both.
+        mod = 128
+    else:
+        mod = 256
+    table = []
+    for i in range(256):
+        if 32 <= (i % mod) <= 126:
+            table.append(chr(i))
+        else:
+            table.append("\\%03o" % i)
+    # These characters have dedicated short escapes that take precedence
+    # over the defaults computed above.
+    table[ord('\\')] = '\\\\'
+    table[ord('\t')] = '\\t'
+    table[ord('\r')] = '\\r'
+    table[ord('\n')] = '\\n'
+    table[ord('\"')] = '\\"'
+    escapes = table
+
+# Populate the escape table once at import time, with the flag that lets
+# iso-8859 characters pass through unescaped.
+make_escapes(True)
+
+def escape(s):
+    """Return s with each character replaced by its escapes-table entry.
+
+    The character's code, ord(ch), is the index into the module-level
+    escapes list, so the table must already be populated and every
+    character code in s must fall inside it.
+    """
+    return EMPTYSTRING.join([escapes[ord(ch)] for ch in s])
+
+def normalize(s):
+    """Return s as a double-quoted, C-style string for use in a .po file.
+
+    A string without any newline becomes a single quoted string.  A string
+    containing newlines becomes an empty "" followed by one quoted string
+    per line, each on its own output line, with the line breaks written as
+    the two characters backslash-n.
+    """
+    pieces = s.split('\n')
+    if len(pieces) == 1:
+        # No newline anywhere: one quoted string is all that is needed.
+        return '"' + escape(s) + '"'
+    if pieces[-1] == '':
+        # s ended with a newline.  Drop the empty tail and put the newline
+        # back on the last real line, where escape() turns it into \n.
+        pieces.pop()
+        pieces[-1] += '\n'
+    escaped = [escape(piece) for piece in pieces]
+    # Each joint closes the current quoted string with an escaped newline
+    # and opens the next one on a fresh output line.
+    return '""\n"' + '\\n"\n"'.join(escaped) + '"'
+
+def write(list, fp):
+    """Write a .po template to fp: the header, then one entry per message.
+
+    list -- iterable of msgid strings.  It is not modified; entries are
+        written in sorted order.  A string that occurs several times yields
+        a single entry and empty strings are skipped, because a .po file
+        may define each msgid only once and the empty msgid is already
+        used by the header entry that starts the file.
+    fp -- file-like object with a write() method.
+    """
+    timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
+    # The time stamp in the header doesn't have the same format as that
+    # generated by xgettext...
+    fp.write(pot_header % {'time': timestamp, 'version': __version__})
+    fp.write('\n')
+    # sorted(set(...)) removes duplicates and leaves the caller's sequence
+    # untouched, unlike an in-place sort of the argument.
+    for k in sorted(set(list)):
+        if not k:
+            continue
+        fp.write('msgid %s\n' % normalize(k))
+        fp.write('msgstr ""\n\n')
+
+# End of the code adapted from pygettext.py
+
+
+def html2po(html, po):
+    """Extract the translatable strings of an HTML file into a .po template.
+
+    html -- path of the HTML file to read.
+    po -- path of the .po file to write; it is opened in 'w' mode, so an
+        existing file is overwritten.
+
+    An element is treated as translatable when its class attribute
+    contains the word "translate".  The string recorded for it is whatever
+    tag_text() returns for that element.
+    """
+    # Both files are closed explicitly, even when an error occurs, instead
+    # of being left open until the interpreter reclaims them.
+    source = open(html)
+    try:
+        markup = source.read()
+    finally:
+        source.close()
+    soup = BeautifulSoup.BeautifulSoup(markup)
+    translatables = soup.findAll(
+        attrs={'class': re.compile('\\btranslate\\b')}
+        )
+    # The output file is opened only after parsing, so a failure while
+    # reading the HTML does not truncate an existing .po file.
+    target = open(po, 'w')
+    try:
+        write(map(tag_text, translatables), target)
+    finally:
+        target.close()
+
+if __name__ == '__main__':
+    # Command-line entry point: html2po.py [-i FILE] [-o FILE]
+    from optparse import OptionParser
+
+    cli = OptionParser()
+    cli.add_option(
+        '-i', '--input',
+        dest='html', metavar='FILE', default='index.html',
+        help='input html file')
+    cli.add_option(
+        '-o', '--output',
+        dest='po', metavar='FILE', default='messages.po',
+        help='output po file')
+    # parse_args() returns (options, leftover arguments); the leftover
+    # positional arguments are not used.
+    opts = cli.parse_args()[0]
+    html2po(opts.html, opts.po)