diff options
author | Rafael Ortiz <rafael@activitycentral.com> | 2013-03-07 05:38:50 (GMT) |
---|---|---|
committer | Rafael Ortiz <rafael@activitycentral.com> | 2013-03-07 05:38:50 (GMT) |
commit | cc02fea1202a9dea2b5c57d4c501889447fa1b59 (patch) | |
tree | 5be075bccc887c04b2434c63b6bf23ede2fe98ee /opds.py |
Initial commit
Diffstat (limited to 'opds.py')
-rw-r--r-- | opds.py | 602 |
1 files changed, 602 insertions, 0 deletions
#!/usr/bin/env python

# Copyright (C) 2009 Sayamindu Dasgupta <sayamindu@laptop.org>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

"""OPDS catalog and Internet Archive query helpers for the
Get-IA-Books activity.

Provides background-threaded downloading and parsing of OPDS feeds
(local or remote), an Internet Archive CSV search backend, and a cover
image downloader.  Results are surfaced to the GTK main loop through
the ``updated`` GObject signal.
"""

from gi.repository import GObject
from gi.repository import Gtk

from sugar3 import network

import logging
import threading
import os
import urllib
import time
import csv

import sys
sys.path.insert(0, './')
import feedparser

_logger = logging.getLogger('get-ia-books-activity')

# OPDS link relations.  NOTE: the misspelling "ACQUISTION" in the
# constant name is kept for compatibility with other code in this
# project; the URL value itself is correct.
_REL_OPDS_ACQUISTION = u'http://opds-spec.org/acquisition'
_REL_SUBSECTION = 'subsection'
_REL_OPDS_POPULAR = u'http://opds-spec.org/sort/popular'
_REL_OPDS_NEW = u'http://opds-spec.org/sort/new'
_REL_ALTERNATE = 'alternate'
_REL_CRAWLABLE = 'http://opds-spec.org/crawlable'

GObject.threads_init()


class ReadURLDownloader(network.GlibURLDownloader):
    """URLDownloader that provides content-length and content-type."""

    def get_content_length(self):
        """Return the Content-Length of the download in bytes.

        Returns 0 when the header is absent or no response info is
        available yet.
        """
        if self._info is not None:
            length = self._info.headers.get('Content-Length')
            if length is not None:
                return int(length)
            else:
                return 0
        # No response yet: report 0 rather than an implicit None so
        # callers comparing with "> 0" behave consistently.
        return 0

    def get_content_type(self):
        """Return the Content-Type of the download, or None if unknown."""
        if self._info is not None:
            return self._info.headers.get('Content-type')
        return None


class DownloadThread(threading.Thread):
    """Fetch and parse an OPDS feed in the background.

    On completion the parsed feed is stored on the owning QueryResult
    and ``notify_updated`` is scheduled on the GTK main loop.
    """

    def __init__(self, obj, midway):
        """obj is the owning QueryResult; midway is True when this
        download extends an existing result set (pagination)."""
        threading.Thread.__init__(self)
        self.midway = midway
        self.obj = obj
        self.stopthread = threading.Event()

    def _download(self):

        def entry_type(entry):
            # An entry exposing a navigation relation is a sub-catalog;
            # anything else is a downloadable book.
            for link in entry['links']:
                if link['rel'] in \
                        [_REL_OPDS_POPULAR, _REL_OPDS_NEW, _REL_SUBSECTION]:
                    return 'CATALOG'
            return 'BOOK'

        logging.debug('feedparser version %s', feedparser.__version__)
        if not self.obj.is_local() and not self.midway:
            # Remote search: append the query term and, when set, an
            # explicit language both as a header and a query parameter.
            uri = self.obj._uri + self.obj._queryterm.replace(' ', '+')
            headers = {}
            if self.obj._language is not None and self.obj._language != 'all':
                headers['Accept-Language'] = self.obj._language
                uri = uri + '&lang=' + self.obj._language
            logging.error('Searching URL %s headers %s' % (uri, headers))
            feedobj = feedparser.parse(uri, request_headers=headers)
        else:
            feedobj = feedparser.parse(self.obj._uri)

        # Determine the catalog type: a "crawlable" feed lists only
        # sub-catalogs, never individual books.
        catalog_type = 'COMMON'
        if 'links' in feedobj['feed']:
            for link in feedobj['feed']['links']:
                if link['rel'] == _REL_CRAWLABLE:
                    catalog_type = 'CRAWLABLE'
                    break

        for entry in feedobj['entries']:
            # Fixed: the original compared strings with "is not".
            if entry_type(entry) == 'BOOK' and catalog_type != 'CRAWLABLE':
                self.obj._booklist.append(Book(self.obj._configuration, entry))
            elif entry_type(entry) == 'CATALOG' or catalog_type == 'CRAWLABLE':
                self.obj._cataloglist.append(
                    Book(self.obj._configuration, entry))

        self.obj._feedobj = feedobj
        self.obj._ready = True
        # Hand the notification back to the GTK main loop.
        GObject.idle_add(self.obj.notify_updated, self.midway)
        return False

    def run(self):
        self._download()

    def stop(self):
        self.stopthread.set()


class Book(object):
    """A single OPDS entry, exposing its metadata with 'Unknown'
    fallbacks for missing fields.

    basepath, when given, is prepended (as a file:// URL) to relative
    acquisition/cover links of local catalogs.
    """

    def __init__(self, configuration, entry, basepath=None):
        self._entry = entry
        self._basepath = basepath
        self._configuration = configuration

    def get_title(self):
        """Return the entry title, or 'Unknown'."""
        try:
            ret = self._entry['title']
        except KeyError:
            ret = 'Unknown'

        return ret

    def get_author(self):
        """Return the entry author, or 'Unknown'."""
        try:
            ret = self._entry['author']
        except KeyError:
            ret = 'Unknown'

        return ret

    def get_download_links(self):
        """Return a dict mapping MIME type -> download URL.

        Relative links in local catalogs are resolved against
        basepath as file:// URLs.
        """
        ret = {}
        for link in self._entry['links']:
            if link['rel'] == _REL_OPDS_ACQUISTION:
                if self._basepath is not None and \
                        not (link['href'].startswith('http') or
                             link['href'].startswith('ftp')):
                    ret[link['type']] = 'file://' \
                        + os.path.join(self._basepath, link['href'])
                else:
                    ret[link['type']] = link['href']
            elif link['rel'] in \
                    [_REL_OPDS_POPULAR, _REL_OPDS_NEW, _REL_SUBSECTION]:
                ret[link['type']] = link['href']
            elif link['rel'] == _REL_ALTERNATE:
                ret[link['type']] = link['href']
            else:
                pass
        return ret

    def get_publisher(self):
        """Return the Dublin Core publisher, or 'Unknown'."""
        try:
            ret = self._entry['dcterms_publisher']
        except KeyError:
            ret = 'Unknown'

        return ret

    def get_published_year(self):
        """Return the publication date string, or 'Unknown'."""
        try:
            ret = self._entry['published']
        except KeyError:
            ret = 'Unknown'

        return ret

    def get_language(self):
        """Return the Dublin Core language, or 'Unknown'."""
        try:
            ret = self._entry['dcterms_language']
        except KeyError:
            ret = 'Unknown'

        return ret

    def get_image_url(self):
        """Return a dict mapping MIME type -> cover image URL.

        The cover link relation is catalog-specific and comes from
        the configuration's 'opds_cover' key; returns 'Unknown' when
        the configuration lacks it.
        """
        try:
            ret = {}
            for link in self._entry['links']:
                if link['rel'] == self._configuration['opds_cover']:
                    if self._basepath is not None and \
                            not (link['href'].startswith('http') or
                                 link['href'].startswith('ftp')):
                        ret[link['type']] = 'file://' \
                            + os.path.join(self._basepath, link['href'])
                    else:
                        ret[link['type']] = link['href']
        except KeyError:
            ret = 'Unknown'
        return ret

    def get_summary(self):
        """Return the entry summary using the configured field name,
        or 'Unknown' when unconfigured or missing."""
        if self._configuration is not None \
                and 'summary_field' in self._configuration:
            try:
                ret = self._entry[self._configuration['summary_field']]
            except KeyError:
                ret = 'Unknown'
        else:
            ret = 'Unknown'
        return ret

    def get_object_id(self):
        """Return the entry object id, or 'Unknown'."""
        try:
            ret = self._entry['object_id']
        except KeyError:
            ret = 'Unknown'

        return ret

    def match(self, terms):
        """Return True if any '+'-separated term appears in the title,
        author or publisher."""
        # TODO: Make this more comprehensive
        for term in terms.split('+'):
            if term in self.get_title():
                return True
            if term in self.get_author():
                return True
            if term in self.get_publisher():
                return True
        return False


class QueryResult(GObject.GObject):
    """Asynchronous result set for an OPDS query.

    Emits ``updated(midway)`` on the main loop whenever a download
    thread finishes; midway is True for pagination updates.
    """

    __gsignals__ = {
        'updated': (GObject.SignalFlags.RUN_FIRST,
                    None,
                    ([bool])),
    }

    def __init__(self, configuration, queryterm, language):
        GObject.GObject.__init__(self)
        self._configuration = configuration
        self._uri = self._configuration['query_uri']
        self._queryterm = queryterm
        self._language = language
        self._feedobj = None
        self._next_uri = ''
        self._ready = False
        self._booklist = []
        self._cataloglist = []
        self.threads = []
        self._start_download()

    def _start_download(self, midway=False):
        d_thread = DownloadThread(self, midway)
        self.threads.append(d_thread)
        d_thread.start()

    def notify_updated(self, midway):
        self.emit('updated', midway)

    def __len__(self):
        return len(self._booklist)

    def has_next(self):
        '''
        Returns True if more result pages are
        available for the resultset
        '''
        if 'links' not in self._feedobj['feed']:
            return False
        for link in self._feedobj['feed']['links']:
            if link['rel'] == u'next':
                self._next_uri = link['href']
                return True

        return False

    def update_with_next(self):
        '''
        Updates the booklist with the next resultset
        '''
        if len(self._next_uri) > 0:
            self._ready = False
            self._uri = self._next_uri
            self.cancel()  # XXX: Is this needed ?
            self._start_download(midway=True)

    def cancel(self):
        '''
        Cancels the query job
        '''
        for d_thread in self.threads:
            d_thread.stop()

    def get_book_n(self, n):
        '''
        Gets the n-th book
        '''
        return self._booklist[n]

    def get_book_list(self):
        '''
        Gets the entire booklist
        '''
        return self._booklist

    def get_catalog_list(self):
        '''
        Gets the entire catalog list
        '''
        return self._cataloglist

    def is_ready(self):
        '''
        Returns False if a query is in progress
        '''
        return self._ready

    def is_local(self):
        '''
        Returns True in case of a local school
        server or a local device
        (yay! for sneakernet)
        '''
        return False


class LocalVolumeQueryResult(QueryResult):
    """QueryResult backed by a catalog.xml file on a local volume."""

    def __init__(self, path, queryterm, language):
        configuration = {'query_uri': os.path.join(path, 'catalog.xml')}
        QueryResult.__init__(self, configuration, queryterm, language)

    def is_local(self):
        return True

    def get_book_list(self):
        """Return the matching books, filtered by the query term when
        one was given."""
        ret = []
        basepath = os.path.dirname(self._uri)
        # Fixed: the original passed the entry as the configuration
        # argument ("Book(entry, basepath=...)"), and compared strings
        # with "is".
        if self._queryterm is None or self._queryterm == '':
            for entry in self._feedobj['entries']:
                ret.append(Book(self._configuration, entry,
                                basepath=basepath))
        else:
            for entry in self._feedobj['entries']:
                book = Book(self._configuration, entry, basepath=basepath)
                if book.match(self._queryterm.replace(' ', '+')):
                    ret.append(book)
        return ret


class RemoteQueryResult(QueryResult):
    """QueryResult for a remote OPDS catalog."""

    def __init__(self, configuration, queryterm, language):
        QueryResult.__init__(self, configuration, queryterm, language)


class IABook(Book):
    """Book whose entry was synthesized from an Internet Archive CSV
    row rather than an OPDS feed."""

    def __init__(self, configuration, entry, basepath=None):
        # Fixed: the original forwarded basepath=None, dropping the
        # caller-supplied value.
        Book.__init__(self, configuration, entry, basepath=basepath)

    def get_download_links(self):
        # IA entries store a ready-made {mime type: url} dict.
        return self._entry['links']

    def get_image_url(self):
        return {'jpg': self._entry['cover_image']}


class DownloadIAThread(threading.Thread):
    """Query the Internet Archive advanced search (CSV output) in the
    background and populate the owner's booklist with IABook objects."""

    def __init__(self, obj, midway):
        threading.Thread.__init__(self)
        self.midway = midway
        self.obj = obj
        self._download_content_length = 0
        self._download_content_type = None
        self._booklist = []
        queryterm = self.obj._queryterm
        # search_tuple = queryterm.lower().split()
        # 'fl[]'/'sort[]' are repeated-array parameters and must be
        # URL-quoted.
        FL = urllib.quote('fl[]')
        SORT = urllib.quote('sort[]')
        self.search_url = 'http://www.archive.org/advancedsearch.php?q=' + \
            urllib.quote('(title:(' + self.obj._queryterm.lower() + ') OR ' +
                         'creator:(' + queryterm.lower() +
                         ')) AND format:(DJVU)')
        self.search_url += '&' + FL + '=creator&' + FL + '=description&' + \
            FL + '=format&' + FL + '=identifier&' + FL + '=language'
        self.search_url += '&' + FL + '=publisher&' + FL + '=title&' + \
            FL + '=volume'
        self.search_url += '&' + SORT + '=title&' + SORT + '&' + \
            SORT + '=&rows=500&save=yes&fmt=csv&xmlsearch=Search'
        self.stopthread = threading.Event()

    def _download(self):
        GObject.idle_add(self.download_csv, self.search_url)

    def download_csv(self, url):
        """Start downloading the search results CSV to a temp file."""
        logging.error('get csv from %s', url)
        path = os.path.join(self.obj._activity.get_activity_root(),
                            'instance', 'tmp%i.csv' % time.time())
        _logger.debug('path=%s', path)
        getter = ReadURLDownloader(url)
        getter.connect("finished", self._get_csv_result_cb)
        getter.connect("progress", self._get_csv_progress_cb)
        getter.connect("error", self._get_csv_error_cb)
        _logger.debug("Starting download to %s...", path)
        try:
            getter.start(path)
        except Exception:
            # Best-effort: errors are also reported via the "error"
            # signal; avoid a bare except that would trap SystemExit.
            _logger.exception('Could not start CSV download')
        self._download_content_type = getter.get_content_type()

    def _get_csv_progress_cb(self, getter, bytes_downloaded):
        if self._download_content_length > 0:
            _logger.debug("Downloaded %u of %u bytes...",
                          bytes_downloaded, self._download_content_length)
        else:
            _logger.debug("Downloaded %u bytes...",
                          bytes_downloaded)

    def _get_csv_error_cb(self, getter, err):
        _logger.debug("Error getting CSV: %s", err)
        self._download_content_length = 0
        self._download_content_type = None

    def _get_csv_result_cb(self, getter, tempfile, suggested_name):
        _logger.debug('Content type: %s', self._download_content_type)
        # Guard against None (set by the error callback) before
        # startswith; an HTML response means we got an error page.
        if self._download_content_type is not None and \
                self._download_content_type.startswith('text/html'):
            self._get_csv_error_cb(getter, 'HTTP Error')
            return
        self.process_downloaded_csv(tempfile, suggested_name)

    def process_downloaded_csv(self, tempfile, suggested_name):
        """Parse the downloaded CSV and build IABook entries.

        Columns: creator, description, format, identifier, language,
        publisher, title, volume.
        """
        reader = csv.reader(open(tempfile, 'rb'))
        next(reader)  # skip the first header row.
        for row in reader:
            # Fixed: the original checked "< 7" but reads row[7] below.
            if len(row) < 8:
                _logger.debug("Server Error: %s", self.search_url)
                return
            entry = {}
            entry['author'] = row[0]
            entry['description'] = row[1]
            entry['format'] = row[2]
            entry['identifier'] = row[3]
            entry['dcterms_language'] = row[4]
            entry['dcterms_publisher'] = row[5]
            entry['title'] = row[6]
            volume = row[7]
            if volume is not None and len(volume) > 0:
                # Fixed: the original concatenated without a space
                # ("TitleVolume 2").
                entry['title'] = row[6] + ' Volume ' + volume

            entry['links'] = {}
            url_base = 'http://www.archive.org/download/' + \
                row[3] + '/' + row[3]

            if entry['format'].find('DjVu') > -1:
                entry['links']['image/x.djvu'] = url_base + '.djvu'
            if entry['format'].find('Grayscale LuraTech PDF') > -1:
                # Fake mime type
                entry['links']['application/pdf-bw'] = url_base + '_bw.pdf'
            if entry['format'].find('PDF') > -1:
                entry['links']['application/pdf'] = url_base + '_text.pdf'
            if entry['format'].find('EPUB') > -1:
                entry['links']['application/epub+zip'] = url_base + '.epub'
            entry['cover_image'] = 'http://www.archive.org/download/' + \
                row[3] + '/page/cover_thumb.jpg'

            self.obj._booklist.append(IABook(None, entry, ''))

        os.remove(tempfile)
        GObject.idle_add(self.obj.notify_updated, self.midway)
        self.obj._ready = True
        return False

    def run(self):
        self._download()

    def stop(self):
        self.stopthread.set()


class InternetArchiveQueryResult(QueryResult):

    # Search in internet archive does not use OPDS
    # because the server implementation is not working very well

    def __init__(self, queryterm, language, activity):
        # Deliberately skips QueryResult.__init__ (no OPDS query_uri);
        # initializes only the state the IA backend needs.
        GObject.GObject.__init__(self)
        self._activity = activity
        self._queryterm = queryterm
        self._language = language
        self._next_uri = ''
        self._ready = False
        self._booklist = []
        self._cataloglist = []
        self.threads = []
        self._start_download()

    def notify_updated(self, midway):
        self.emit('updated', midway)

    def _start_download(self, midway=False):
        d_thread = DownloadIAThread(self, midway)
        self.threads.append(d_thread)
        d_thread.start()


class ImageDownloaderThread(threading.Thread):
    """Download a single cover image in the background; notifies the
    owner with the temp file path, or None on failure."""

    def __init__(self, obj):
        threading.Thread.__init__(self)
        self.obj = obj
        self._getter = ReadURLDownloader(self.obj._url)
        self._download_content_length = 0
        self._download_content_type = None
        self.stopthread = threading.Event()

    def _download_image(self):
        path = os.path.join(self.obj._activity.get_activity_root(),
                            'instance', 'tmp%i' % time.time())
        self._getter.connect("finished", self._get_image_result_cb)
        self._getter.connect("progress", self._get_image_progress_cb)
        self._getter.connect("error", self._get_image_error_cb)
        _logger.debug("Starting download to %s...", path)
        try:
            self._getter.start(path)
        except Exception:
            # Notify with None so the UI can fall back gracefully.
            _logger.debug("Connection timed out for")
            GObject.idle_add(self.obj.notify_updated, None)

        self._download_content_length = \
            self._getter.get_content_length()
        self._download_content_type = self._getter.get_content_type()

    def _get_image_result_cb(self, getter, tempfile, suggested_name):
        _logger.debug("Got Cover Image %s (%s)", tempfile, suggested_name)
        self._getter = None
        if not self.stopthread.is_set():
            GObject.idle_add(self.obj.notify_updated, tempfile)

    def _get_image_progress_cb(self, getter, bytes_downloaded):
        if self.stopthread.is_set():
            try:
                _logger.debug('The download %s was cancelled' % getter._fname)
                getter.cancel()
            except Exception:
                _logger.debug('Got an exception while trying '
                              'to cancel download')
        if self._download_content_length > 0:
            _logger.debug("Downloaded %u of %u bytes...", bytes_downloaded,
                          self._download_content_length)
        else:
            _logger.debug("Downloaded %u bytes...",
                          bytes_downloaded)
        # Keep the UI responsive while the download progresses.
        while Gtk.events_pending():
            Gtk.main_iteration()

    def _get_image_error_cb(self, getter, err):
        _logger.debug("Error getting image: %s", err)
        self._download_content_length = 0
        self._download_content_type = None
        self._getter = None
        GObject.idle_add(self.obj.notify_updated, None)

    def run(self):
        self._download_image()

    def stop(self):
        self.stopthread.set()


class ImageDownloader(GObject.GObject):
    """Owner object for cover image downloads; emits ``updated`` with
    the downloaded temp file path (or None on error)."""

    __gsignals__ = {
        'updated': (GObject.SignalFlags.RUN_FIRST,
                    None,
                    ([GObject.TYPE_STRING])),
    }

    def __init__(self, activity, url):
        GObject.GObject.__init__(self)
        self.threads = []
        self._activity = activity
        self._url = url
        self._start_download()

    def _start_download(self):
        d_thread = ImageDownloaderThread(self)
        self.threads.append(d_thread)
        d_thread.start()

    def notify_updated(self, temp_file):
        self.emit('updated', temp_file)

    def stop_download(self):
        for thread in self.threads:
            thread.stop()