author    Rafael Ortiz <rafael@activitycentral.com>  2013-03-07 05:38:50 (GMT)
committer Rafael Ortiz <rafael@activitycentral.com>  2013-03-07 05:38:50 (GMT)
commit    cc02fea1202a9dea2b5c57d4c501889447fa1b59 (patch)
tree      5be075bccc887c04b2434c63b6bf23ede2fe98ee /opds.py
Initial commit
Diffstat (limited to 'opds.py')
-rw-r--r--  opds.py  602
1 file changed, 602 insertions(+), 0 deletions(-)
diff --git a/opds.py b/opds.py
new file mode 100644
index 0000000..46158f2
--- /dev/null
+++ b/opds.py
@@ -0,0 +1,602 @@
+#! /usr/bin/env python
+
+# Copyright (C) 2009 Sayamindu Dasgupta <sayamindu@laptop.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+from gi.repository import GObject
+from gi.repository import Gtk
+
+from sugar3 import network
+
+import logging
+import threading
+import os
+import urllib
+import time
+import csv
+
+import sys
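+# Prefer a copy of feedparser bundled in the activity directory, if present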
+sys.path.insert(0, './')
+import feedparser
+
+_logger = logging.getLogger('get-ia-books-activity')
+
+_REL_OPDS_ACQUISITION = u'http://opds-spec.org/acquisition'
+_REL_SUBSECTION = 'subsection'
+_REL_OPDS_POPULAR = u'http://opds-spec.org/sort/popular'
+_REL_OPDS_NEW = u'http://opds-spec.org/sort/new'
+_REL_ALTERNATE = 'alternate'
+_REL_CRAWLABLE = 'http://opds-spec.org/crawlable'
+
+GObject.threads_init()
+
+
+class ReadURLDownloader(network.GlibURLDownloader):
+ """URLDownloader that provides content-length and content-type."""
+
+ def get_content_length(self):
+ """Return the content-length of the download."""
+ if self._info is not None:
+ length = self._info.headers.get('Content-Length')
+ if length is not None:
+ return int(length)
+ else:
+ return 0
+
+ def get_content_type(self):
+ """Return the content-type of the download."""
+ if self._info is not None:
+ return self._info.headers.get('Content-type')
+ return None
+
+
+class DownloadThread(threading.Thread):
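+    """Fetch and parse an OPDS feed in a background thread.
+
+    Entries are sorted into the owner's book and catalog lists, and the
+    owner is notified on the main loop via GObject.idle_add().
+    """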
+
+ def __init__(self, obj, midway):
+ threading.Thread.__init__(self)
+ self.midway = midway
+ self.obj = obj
+ self.stopthread = threading.Event()
+
+ def _download(self):
+
+        def entry_type(entry):
+            # An entry is a (sub)catalog if any of its links points to
+            # another feed; otherwise it is a book.
+            for link in entry['links']:
+                if link['rel'] in \
+                        [_REL_OPDS_POPULAR, _REL_OPDS_NEW, _REL_SUBSECTION]:
+                    return 'CATALOG'
+            return 'BOOK'
+
+ logging.debug('feedparser version %s', feedparser.__version__)
+        if not self.obj.is_local() and not self.midway:
+ uri = self.obj._uri + self.obj._queryterm.replace(' ', '+')
+ headers = {}
+ if self.obj._language is not None and self.obj._language != 'all':
+ headers['Accept-Language'] = self.obj._language
+ uri = uri + '&lang=' + self.obj._language
+            logging.debug('Searching URL %s headers %s', uri, headers)
+ feedobj = feedparser.parse(uri, request_headers=headers)
+ else:
+ feedobj = feedparser.parse(self.obj._uri)
+
+        # Determine the catalog type
+        catalog_type = 'COMMON'
+        if 'links' in feedobj['feed']:
+            for link in feedobj['feed']['links']:
+                if link['rel'] == _REL_CRAWLABLE:
+                    catalog_type = 'CRAWLABLE'
+                    break
+
+        for entry in feedobj['entries']:
+            if entry_type(entry) == 'BOOK' and catalog_type != 'CRAWLABLE':
+                self.obj._booklist.append(Book(self.obj._configuration, entry))
+            elif entry_type(entry) == 'CATALOG' or catalog_type == 'CRAWLABLE':
+                self.obj._cataloglist.append(
+                    Book(self.obj._configuration, entry))
+
+ self.obj._feedobj = feedobj
+ self.obj._ready = True
+ GObject.idle_add(self.obj.notify_updated, self.midway)
+ return False
+
+ def run(self):
+ self._download()
+
+ def stop(self):
+ self.stopthread.set()
+
+
+class Book(object):
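+    """Convenience wrapper around a single feedparser entry."""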
+
+ def __init__(self, configuration, entry, basepath=None):
+ self._entry = entry
+ self._basepath = basepath
+ self._configuration = configuration
+
+ def get_title(self):
+ try:
+ ret = self._entry['title']
+ except KeyError:
+ ret = 'Unknown'
+
+ return ret
+
+ def get_author(self):
+ try:
+ ret = self._entry['author']
+ except KeyError:
+ ret = 'Unknown'
+
+ return ret
+
+ def get_download_links(self):
+ ret = {}
+ for link in self._entry['links']:
+            if link['rel'] == _REL_OPDS_ACQUISITION:
+ if self._basepath is not None and \
+ not (link['href'].startswith('http') or \
+ link['href'].startswith('ftp')):
+ ret[link['type']] = 'file://' \
+ + os.path.join(self._basepath, link['href'])
+ else:
+ ret[link['type']] = link['href']
+ elif link['rel'] in \
+ [_REL_OPDS_POPULAR, _REL_OPDS_NEW, _REL_SUBSECTION]:
+ ret[link['type']] = link['href']
+ elif link['rel'] == _REL_ALTERNATE:
+ ret[link['type']] = link['href']
+ else:
+ pass
+ return ret
+
+ def get_publisher(self):
+ try:
+ ret = self._entry['dcterms_publisher']
+ except KeyError:
+ ret = 'Unknown'
+
+ return ret
+
+ def get_published_year(self):
+ try:
+ ret = self._entry['published']
+ except KeyError:
+ ret = 'Unknown'
+
+ return ret
+
+ def get_language(self):
+ try:
+ ret = self._entry['dcterms_language']
+ except KeyError:
+ ret = 'Unknown'
+
+ return ret
+
+ def get_image_url(self):
+ try:
+ ret = {}
+ for link in self._entry['links']:
+ if link['rel'] == self._configuration['opds_cover']:
+ if self._basepath is not None and \
+ not (link['href'].startswith('http') or \
+ link['href'].startswith('ftp')):
+ ret[link['type']] = 'file://' \
+ + os.path.join(self._basepath, link['href'])
+ else:
+ ret[link['type']] = link['href']
+ except KeyError:
+ ret = 'Unknown'
+ return ret
+
+ def get_summary(self):
+ if self._configuration is not None \
+ and 'summary_field' in self._configuration:
+ try:
+ ret = self._entry[self._configuration['summary_field']]
+ except KeyError:
+ ret = 'Unknown'
+ else:
+ ret = 'Unknown'
+ return ret
+
+ def get_object_id(self):
+ try:
+ ret = self._entry['object_id']
+ except KeyError:
+ ret = 'Unknown'
+
+ return ret
+
+ def match(self, terms):
+ #TODO: Make this more comprehensive
+ for term in terms.split('+'):
+ if term in self.get_title():
+ return True
+ if term in self.get_author():
+ return True
+ if term in self.get_publisher():
+ return True
+ return False
+
+
+class QueryResult(GObject.GObject):
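+    """Result of an OPDS query: a list of books and a list of catalogs.
+
+    Downloads happen in a background thread; the 'updated' signal is
+    emitted (with a midway flag) once a result page has been parsed.
+    """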
+
+ __gsignals__ = {
+ 'updated': (GObject.SignalFlags.RUN_FIRST,
+ None,
+ ([bool])),
+ }
+
+ def __init__(self, configuration, queryterm, language):
+ GObject.GObject.__init__(self)
+ self._configuration = configuration
+ self._uri = self._configuration['query_uri']
+ self._queryterm = queryterm
+ self._language = language
+ self._feedobj = None
+ self._next_uri = ''
+ self._ready = False
+ self._booklist = []
+ self._cataloglist = []
+ self.threads = []
+ self._start_download()
+
+ def _start_download(self, midway=False):
+ d_thread = DownloadThread(self, midway)
+ self.threads.append(d_thread)
+ d_thread.start()
+
+ def notify_updated(self, midway):
+ self.emit('updated', midway)
+
+ def __len__(self):
+ return len(self._booklist)
+
+ def has_next(self):
+ '''
+ Returns True if more result pages are
+ available for the resultset
+ '''
+        if 'links' not in self._feedobj['feed']:
+ return False
+ for link in self._feedobj['feed']['links']:
+ if link['rel'] == u'next':
+ self._next_uri = link['href']
+ return True
+
+ return False
+
+ def update_with_next(self):
+ '''
+ Updates the booklist with the next resultset
+ '''
+ if len(self._next_uri) > 0:
+ self._ready = False
+ self._uri = self._next_uri
+ self.cancel() # XXX: Is this needed ?
+ self._start_download(midway=True)
+
+ def cancel(self):
+ '''
+ Cancels the query job
+ '''
+ for d_thread in self.threads:
+ d_thread.stop()
+
+ def get_book_n(self, n):
+ '''
+ Gets the n-th book
+ '''
+ return self._booklist[n]
+
+ def get_book_list(self):
+ '''
+ Gets the entire booklist
+ '''
+ return self._booklist
+
+ def get_catalog_list(self):
+ '''
+ Gets the entire catalog list
+ '''
+ return self._cataloglist
+
+ def is_ready(self):
+ '''
+ Returns False if a query is in progress
+ '''
+ return self._ready
+
+ def is_local(self):
+ '''
+ Returns True in case of a local school
+ server or a local device
+ (yay! for sneakernet)
+ '''
+ return False
+
+
+class LocalVolumeQueryResult(QueryResult):
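+    """Query result backed by a catalog.xml file on a local volume."""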
+
+ def __init__(self, path, queryterm, language):
+ configuration = {'query_uri': os.path.join(path, 'catalog.xml')}
+ QueryResult.__init__(self, configuration, queryterm, language)
+
+ def is_local(self):
+ return True
+
+ def get_book_list(self):
+ ret = []
+        if self._queryterm is None or self._queryterm == '':
+            for entry in self._feedobj['entries']:
+                ret.append(Book(self._configuration, entry,
+                                basepath=os.path.dirname(self._uri)))
+        else:
+            for entry in self._feedobj['entries']:
+                book = Book(self._configuration, entry,
+                            basepath=os.path.dirname(self._uri))
+                if book.match(self._queryterm.replace(' ', '+')):
+                    ret.append(book)
+ return ret
+
+
+class RemoteQueryResult(QueryResult):
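+    """Query result backed by a remote OPDS catalog."""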
+
+ def __init__(self, configuration, queryterm, language):
+ QueryResult.__init__(self, configuration, queryterm, language)
+
+
+class IABook(Book):
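+    """Book built from a row of an Internet Archive CSV result."""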
+
+ def __init__(self, configuration, entry, basepath=None):
+        Book.__init__(self, configuration, entry, basepath=basepath)
+
+ def get_download_links(self):
+ return self._entry['links']
+
+ def get_image_url(self):
+ return {'jpg': self._entry['cover_image']}
+
+
+class DownloadIAThread(threading.Thread):
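+    """Query the Internet Archive advanced search API in the background
+    and fill the owner's book list from the CSV results."""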
+
+ def __init__(self, obj, midway):
+ threading.Thread.__init__(self)
+ self.midway = midway
+ self.obj = obj
+ self._download_content_length = 0
+ self._download_content_type = None
+ self._booklist = []
+ queryterm = self.obj._queryterm
+ # search_tuple = queryterm.lower().split()
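+        # Build an archive.org advancedsearch query that matches the
+        # term against title or creator, restricted to items available
+        # in DJVU format, and requests the fl[] fields as CSV.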
+ FL = urllib.quote('fl[]')
+ SORT = urllib.quote('sort[]')
+ self.search_url = 'http://www.archive.org/advancedsearch.php?q=' + \
+ urllib.quote('(title:(' + self.obj._queryterm.lower() + ') OR ' + \
+ 'creator:(' + queryterm.lower() + ')) AND format:(DJVU)')
+ self.search_url += '&' + FL + '=creator&' + FL + '=description&' + \
+ FL + '=format&' + FL + '=identifier&' + FL + '=language'
+ self.search_url += '&' + FL + '=publisher&' + FL + '=title&' + \
+ FL + '=volume'
+        self.search_url += '&' + SORT + '=title&' + SORT + '=&' + \
+            SORT + '=&rows=500&save=yes&fmt=csv&xmlsearch=Search'
+ self.stopthread = threading.Event()
+
+ def _download(self):
+ GObject.idle_add(self.download_csv, self.search_url)
+
+ def download_csv(self, url):
+ logging.error('get csv from %s', url)
+ path = os.path.join(self.obj._activity.get_activity_root(), 'instance',
+ 'tmp%i.csv' % time.time())
+        _logger.debug('path=%s', path)
+ getter = ReadURLDownloader(url)
+ getter.connect("finished", self._get_csv_result_cb)
+ getter.connect("progress", self._get_csv_progress_cb)
+ getter.connect("error", self._get_csv_error_cb)
+ _logger.debug("Starting download to %s...", path)
+        try:
+            getter.start(path)
+        except Exception:
+            _logger.debug('Failed to start download from %s', url)
+ self._download_content_type = getter.get_content_type()
+
+ def _get_csv_progress_cb(self, getter, bytes_downloaded):
+ if self._download_content_length > 0:
+ _logger.debug("Downloaded %u of %u bytes...",
+ bytes_downloaded, self._download_content_length)
+ else:
+ _logger.debug("Downloaded %u bytes...",
+ bytes_downloaded)
+
+ def _get_csv_error_cb(self, getter, err):
+ _logger.debug("Error getting CSV: %s", err)
+ self._download_content_length = 0
+ self._download_content_type = None
+
+ def _get_csv_result_cb(self, getter, tempfile, suggested_name):
+        _logger.debug('Content type: %s', self._download_content_type)
+        if self._download_content_type and \
+                self._download_content_type.startswith('text/html'):
+ # got an error page instead
+ self._get_csv_error_cb(getter, 'HTTP Error')
+ return
+ self.process_downloaded_csv(tempfile, suggested_name)
+
+ def process_downloaded_csv(self, tempfile, suggested_name):
+ reader = csv.reader(open(tempfile, 'rb'))
+ reader.next() # skip the first header row.
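+        # Expected columns, matching the fl[] fields requested above:
+        # creator, description, format, identifier, language,
+        # publisher, title, volume.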
+ for row in reader:
+            if len(row) < 8:
+ _logger.debug("Server Error: %s", self.search_url)
+ return
+ entry = {}
+ entry['author'] = row[0]
+ entry['description'] = row[1]
+ entry['format'] = row[2]
+ entry['identifier'] = row[3]
+ entry['dcterms_language'] = row[4]
+ entry['dcterms_publisher'] = row[5]
+ entry['title'] = row[6]
+ volume = row[7]
+ if volume is not None and len(volume) > 0:
+                entry['title'] = row[6] + ' Volume ' + volume
+
+ entry['links'] = {}
+ url_base = 'http://www.archive.org/download/' + \
+ row[3] + '/' + row[3]
+
+ if entry['format'].find('DjVu') > -1:
+ entry['links']['image/x.djvu'] = url_base + '.djvu'
+ if entry['format'].find('Grayscale LuraTech PDF') > -1:
+ # Fake mime type
+ entry['links']['application/pdf-bw'] = url_base + '_bw.pdf'
+ if entry['format'].find('PDF') > -1:
+ entry['links']['application/pdf'] = url_base + '_text.pdf'
+ if entry['format'].find('EPUB') > -1:
+ entry['links']['application/epub+zip'] = url_base + '.epub'
+ entry['cover_image'] = 'http://www.archive.org/download/' + \
+ row[3] + '/page/cover_thumb.jpg'
+
+ self.obj._booklist.append(IABook(None, entry, ''))
+
+ os.remove(tempfile)
+ GObject.idle_add(self.obj.notify_updated, self.midway)
+ self.obj._ready = True
+ return False
+
+ def run(self):
+ self._download()
+
+ def stop(self):
+ self.stopthread.set()
+
+
+class InternetArchiveQueryResult(QueryResult):
+
+    # Searching the Internet Archive does not use OPDS because
+    # the server-side OPDS implementation does not work very well.
+
+ def __init__(self, queryterm, language, activity):
+ GObject.GObject.__init__(self)
+ self._activity = activity
+ self._queryterm = queryterm
+ self._language = language
+ self._next_uri = ''
+ self._ready = False
+ self._booklist = []
+ self._cataloglist = []
+ self.threads = []
+ self._start_download()
+
+ def notify_updated(self, midway):
+ self.emit('updated', midway)
+
+ def _start_download(self, midway=False):
+ d_thread = DownloadIAThread(self, midway)
+ self.threads.append(d_thread)
+ d_thread.start()
+
+
+class ImageDownloaderThread(threading.Thread):
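+    """Download a single image and report progress and results back
+    to the owning ImageDownloader."""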
+
+ def __init__(self, obj):
+ threading.Thread.__init__(self)
+ self.obj = obj
+ self._getter = ReadURLDownloader(self.obj._url)
+ self._download_content_length = 0
+ self._download_content_type = None
+ self.stopthread = threading.Event()
+
+ def _download_image(self):
+ path = os.path.join(self.obj._activity.get_activity_root(),
+ 'instance', 'tmp%i' % time.time())
+ self._getter.connect("finished", self._get_image_result_cb)
+ self._getter.connect("progress", self._get_image_progress_cb)
+ self._getter.connect("error", self._get_image_error_cb)
+ _logger.debug("Starting download to %s...", path)
+ try:
+ self._getter.start(path)
+        except Exception:
+            _logger.debug("Connection timed out for %s", self.obj._url)
+ GObject.idle_add(self.obj.notify_updated, None)
+
+ self._download_content_length = \
+ self._getter.get_content_length()
+ self._download_content_type = self._getter.get_content_type()
+
+ def _get_image_result_cb(self, getter, tempfile, suggested_name):
+ _logger.debug("Got Cover Image %s (%s)", tempfile, suggested_name)
+ self._getter = None
+ if not self.stopthread.is_set():
+ GObject.idle_add(self.obj.notify_updated, tempfile)
+
+ def _get_image_progress_cb(self, getter, bytes_downloaded):
+ if self.stopthread.is_set():
+            try:
+                _logger.debug('The download %s was cancelled', getter._fname)
+                getter.cancel()
+            except Exception:
+                _logger.debug('Got an exception while trying '
+                              'to cancel the download')
+ if self._download_content_length > 0:
+ _logger.debug("Downloaded %u of %u bytes...", bytes_downloaded,
+ self._download_content_length)
+ else:
+ _logger.debug("Downloaded %u bytes...",
+ bytes_downloaded)
+ while Gtk.events_pending():
+ Gtk.main_iteration()
+
+ def _get_image_error_cb(self, getter, err):
+ _logger.debug("Error getting image: %s", err)
+ self._download_content_length = 0
+ self._download_content_type = None
+ self._getter = None
+ GObject.idle_add(self.obj.notify_updated, None)
+
+ def run(self):
+ self._download_image()
+
+ def stop(self):
+ self.stopthread.set()
+
+
+class ImageDownloader(GObject.GObject):
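+    """Download an image in a background thread and emit 'updated'
+    with the temporary file path (or None on failure)."""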
+
+ __gsignals__ = {
+ 'updated': (GObject.SignalFlags.RUN_FIRST,
+ None,
+ ([GObject.TYPE_STRING])),
+ }
+
+ def __init__(self, activity, url):
+ GObject.GObject.__init__(self)
+ self.threads = []
+ self._activity = activity
+ self._url = url
+ self._start_download()
+
+ def _start_download(self):
+ d_thread = ImageDownloaderThread(self)
+ self.threads.append(d_thread)
+ d_thread.start()
+
+ def notify_updated(self, temp_file):
+ self.emit('updated', temp_file)
+
+ def stop_download(self):
+ for thread in self.threads:
+ thread.stop()