1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
"""
olpc.datastore.converter
~~~~~~~~~~~~~~~~~~~~~~~~
Convert binary formats to unicode text for indexing.
Normally we would rely heavily on third-party tools to do the
conversion. In the OLPC use case we want to minimize such
dependencies, so we make only a minimal attempt to extract what text
we can.
"""
__author__ = 'Benjamin Saller <bcsaller@objectrealms.net>'
__docformat__ = 'restructuredtext'
__copyright__ = 'Copyright ObjectRealms, LLC, 2007'
__license__ = 'The GNU Public License V2+'
from olpc.datastore.utils import Singleton
import codecs
import logging
import os
import subprocess
import sys
import tempfile
import gnomevfs
def guess_mimetype(filename):
    """Return the MIME type that gnomevfs reports for *filename*.

    The path is made absolute before it is handed to gnomevfs.
    """
    return gnomevfs.get_mime_type(os.path.abspath(filename))
class subprocessconverter(object):
    """Run an external command to convert a file and collect its output.

    The command template may use the following substitution variables:

    ``%(source)s``
        Required. The path of the input file.
    ``%(target)s``
        Optional. The path the command should write its output to. If
        it is omitted, a ``find_target(source)`` callable must be
        supplied which returns the name of the file the command is
        expected to produce.

    Example::

        pdftotext %(source)s %(target)s

    Calling the converter returns a file object opened for reading
    (decoding UTF-8) to be passed to the indexer, or None when the
    command exits with a non-zero status.
    """

    def __init__(self, cmd, find_target=None):
        """Store the command template and validate its placeholders.

        :param cmd: command line template containing ``%(source)s`` and,
            optionally, ``%(target)s``.
        :param find_target: callable mapping the source filename to the
            output filename; required when *cmd* has no ``%(target)s``.
        :raises ValueError: if *cmd* lacks ``%(source)s``, or lacks
            ``%(target)s`` while *find_target* is not callable.
        """
        self.raw = cmd
        # True when the output name must come from find_target() because
        # the command template has no %(target)s placeholder.
        self.require_target = False
        self.find_target = find_target

        if '%(source)s' not in cmd:
            raise ValueError("doesn't handle source")
        if '%(target)s' not in cmd:
            if not callable(find_target):
                raise ValueError("no way of locating conversion target")
            self.require_target = True

    def verify(self):
        """should this converter be used?

        True when the first word of the command template names an
        existing path.
        """
        return os.path.exists(self.raw.split()[0])

    def __call__(self, filename):
        """Convert *filename* and return the output opened as UTF-8.

        Returns None when the command exits with a non-zero status. The
        output file is always unlinked before returning.
        """
        if self.require_target:
            # XXX: methods that return something bad here
            # will result in the wrong thing being unlinked
            target = self.find_target(filename)
        else:
            # mkstemp returns an open descriptor; close it so it is not
            # leaked -- only the reserved name is needed.
            fd, target = tempfile.mkstemp()
            os.close(fd)
        data = {'source': filename, 'target': target}

        # stderr is discarded to hide glib error messages from
        # converters which shouldn't be generating output anyway.
        # A real sink is used instead of an unread pipe so that a
        # chatty child cannot block on a full pipe buffer.
        devnull = open(os.devnull, 'w')
        try:
            # Split the template first and substitute per argument so
            # that filenames containing whitespace stay one argument.
            cmd = [part % data for part in self.raw.split()]
            retcode = subprocess.call(cmd, stderr=devnull)
            if retcode:
                return None
            return codecs.open(target, 'r', 'utf-8')
        except UnicodeDecodeError:
            # The data was an unknown type but couldn't be understood
            # as text so we don't attempt to index it. This most
            # likely means its just an unknown binary format.
            return None
        finally:
            devnull.close()
            # we unlink the file as its already been opened for
            # reading
            if os.path.exists(target):
                os.unlink(target)
class noop(object):
    """Pass-through converter: opens the file directly as UTF-8 text."""

    def verify(self):
        """Report that this converter is always usable."""
        return True

    def __call__(self, filename):
        """Return *filename* opened for reading with UTF-8 decoding."""
        reader = codecs.open(filename, mode='r', encoding='utf-8')
        return reader
class Converter(object):
    """Registry that dispatches a file to a text converter plugin.

    Plugins are objects with a ``verify()`` method and a
    ``__call__(filename)`` method. Lookup at conversion time is by
    MIME type; the first plugin successfully registered is used as the
    fallback for unregistered, non-binary types.
    """
    __metaclass__ = Singleton

    def __init__(self):
        # maps both extension -> plugin
        # and mimetype -> plugin
        self._converters = {}
        # fallback plugin: the first one stored by registerConverter
        self._default = None
        self.logger = logging.getLogger('org.laptop.sugar.Indexer')

    def registerConverter(self, ext_or_mime, plugin):
        """Register *plugin* under *ext_or_mime* if ``plugin.verify()`` passes.

        The first plugin stored also becomes the default converter.
        """
        if plugin.verify():
            self._converters[ext_or_mime] = plugin
            if self._default is None:
                self._default = plugin

    def __call__(self, filename, encoding=None, mimetype=None):
        """Convert filename's content to utf-8 encoded text.

        :param filename: path of the file to convert.
        :param encoding: known encoding of the contents; currently not
            used by this method.
        :param mimetype: MIME type of the file; guessed from *filename*
            when not given.
        :returns: whatever the selected plugin returns, or None when no
            plugin applies or the plugin raises.
        """
        mt = mimetype or guess_mimetype(filename)
        if not mt:
            # no usable mimetype: nothing to dispatch on
            return None
        # partition() tolerates a type without a '/' (empty subtype)
        maintype, _sep, subtype = mt.partition('/')

        converter = self._converters.get(mt)
        if not converter:
            # it was an image or an unknown application
            if maintype in ('image', 'application', 'audio', 'video') or \
                    subtype in ('x-trash', 'x-python-bytecode'):
                return None
            converter = self._default

        if not converter:
            return None

        try:
            return converter(filename)
        except Exception:
            # Best effort: a failing plugin only means the file is not
            # indexed. KeyboardInterrupt/SystemExit still propagate.
            self.logger.debug("Binary to Text failed: %s %s",
                              mt, filename, exc_info=True)
        return None
# our global instance
converter = Converter()
# TXT
# Plain text and HTML are read as-is through the noop plugin. It is
# registered first and its verify() always returns True, so it becomes
# the Converter's default plugin -- keep this registration first.
txt = noop()
converter.registerConverter('.txt', txt)
converter.registerConverter('.html', txt)
converter.registerConverter('text/plain', txt)
converter.registerConverter('text/html', txt)
# PDF
# Only stored if /usr/bin/pdftotext exists (checked by verify()).
pdf2txt = subprocessconverter('/usr/bin/pdftotext -nopgbrk -enc UTF-8 %(source)s %(target)s')
converter.registerConverter('.pdf', pdf2txt)
converter.registerConverter('application/pdf', pdf2txt)
# DOC
def find_by_ext(filename, ext="txt"):
    """Return *filename* with its extension replaced by *ext*.

    Used as a ``find_target`` callable for commands that write their
    output next to the source under a different extension.
    """
    stem, _old_ext = os.path.splitext(filename)
    return "%s.%s" % (stem, ext)
# The abiword command has no %(target)s placeholder, so find_by_ext
# supplies the expected output name (the source path with a .txt
# extension). Only stored if /usr/bin/abiword exists.
doctotext = subprocessconverter('/usr/bin/abiword -t txt %(source)s', find_by_ext)
converter.registerConverter('.doc', doctotext)
converter.registerConverter('application/msword', doctotext)
# ODT
# Only stored if /usr/bin/odt2txt exists (checked by verify()).
odt2txt = subprocessconverter('/usr/bin/odt2txt --encoding=UTF-8 --output=%(target)s %(source)s')
converter.registerConverter('.odt', odt2txt)
converter.registerConverter('application/vnd.oasis.opendocument.text', odt2txt)
|