"""
Python wrapper for Google web APIs

This module allows you to access Google's web APIs through SOAP,
to do things like search Google and get the results programmatically.
Described U{here <http://www.google.com/apis/>}

You need a Google-provided license key to use these services.
Follow the link above to get one.  These functions will look in
several places (in this order) for the license key:

    - the "license_key" argument of each function
    - the module-level LICENSE_KEY variable (call setLicense once to set it)
    - an environment variable called GOOGLE_LICENSE_KEY
    - a file called ".googlekey" in the current directory
    - a file called "googlekey.txt" in the current directory
    - a file called ".googlekey" in your home directory
    - a file called "googlekey.txt" in your home directory
    - a file called ".googlekey" in the same directory as google.py
    - a file called "googlekey.txt" in the same directory as google.py

Sample usage::

    >>> import google
    >>> google.setLicense('...') # must get your own key!
    >>> data = google.doGoogleSearch('python')
    >>> data.meta.searchTime
    0.043221000000000002

    >>> data.results[0].URL
    'http://www.python.org/'

    >>> data.results[0].title
    'Python Language Website'

@newfield contrib: Contributors
@author: Mark Pilgrim
@author: Brian Landers
@license: Python
@version: 0.6
@contrib: David Ascher, for the install script
@contrib: Erik Max Francis, for the command line interface
@contrib: Michael Twomey, for HTTP proxy support
@contrib: Mark Recht, for patches to support SOAPpy
"""

__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.6"
__cvsversion__ = "$Revision: 1.5 $"[11:-2]
__date__ = "$Date: 2004/02/25 23:46:07 $"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""

import os
import sys
import getopt

import GoogleSOAPFacade

LICENSE_KEY = None
HTTP_PROXY = None

#
# Constants
#
_url = 'http://api.google.com/search/beta2'
_namespace = 'urn:GoogleSearch'
_googlefile1 = ".googlekey"
_googlefile2 = "googlekey.txt"

_false = GoogleSOAPFacade.false
_true = GoogleSOAPFacade.true

# Ordered lookup table used by getLicense(): each entry is a
# (getter, human-readable description) pair.  A getter receives the
# license_key argument that was passed to the API function and returns
# either a key or a false value.  _usage() prints the descriptions of
# every entry after the first two.
_licenseLocations = (
    ( lambda key: key,
      'passed to the function in license_key variable' ),
    ( lambda key: LICENSE_KEY,
      'module-level LICENSE_KEY variable (call setLicense to set it)' ),
    ( lambda key: os.environ.get( 'GOOGLE_LICENSE_KEY', None ),
      'an environment variable called GOOGLE_LICENSE_KEY' ),
    ( lambda key: _contentsOf( os.getcwd(), _googlefile1 ),
      '%s in the current directory' % _googlefile1 ),
    ( lambda key: _contentsOf( os.getcwd(), _googlefile2 ),
      '%s in the current directory' % _googlefile2 ),
    ( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile1 ),
      '%s in your home directory' % _googlefile1 ),
    ( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile2 ),
      '%s in your home directory' % _googlefile2 ),
    ( lambda key: _contentsOf( _getScriptDir(), _googlefile1 ),
      '%s in the google.py directory' % _googlefile1 ),
    ( lambda key: _contentsOf( _getScriptDir(), _googlefile2 ),
      '%s in the google.py directory' % _googlefile2 ),
)


## ----------------------------------------------------------------------
## Exceptions
## ----------------------------------------------------------------------

class NoLicenseKey(Exception):
    """
    Thrown when the API is unable to find a valid license key.
    """
    pass


## ----------------------------------------------------------------------
## administrative functions (non-API)
## ----------------------------------------------------------------------

def _version():
    """
    Display a formatted version string for the module
    """
    # Single-argument print(...) behaves the same as a print statement
    # on Python 2 and as the print function on Python 3.
    print("""PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s

Thanks to:
%(__credits__)s""" % globals())


def _usage():
    """
    Display usage information for the command-line interface

    The list of license-key locations at the end is generated from
    C{_licenseLocations}, skipping its first two entries (the function
    argument and the module-level variable), which are covered by the
    "command line" bullet.
    """
    program = os.path.basename(sys.argv[0])
    print("""Usage: %(program)s [options] [querytype] query

options:
  -k, --key=          Google license key (see important note below)
  -1, -l, --lucky     show only first hit
  -m, --meta          show meta information
  -r, --reverse       show results in reverse order
  -x, --proxy=        use HTTP proxy
  -h, --help          print this help
  -v, --version       print version and copyright information
  -t, --test          run test queries

querytype:
  -s, --search=       search (default)
  -c, --cache=        retrieve cached page
  -p, --spelling=     check spelling

IMPORTANT NOTE: all Google functions require a valid license key;
visit http://www.google.com/apis/ to get one.  %(program)s will look in
these places (in order) and use the first license key it finds:
  * the key specified on the command line""" % {"program": program})
    for get, location in _licenseLocations[2:]:
        print("  * %s" % location)


## ----------------------------------------------------------------------
## utility functions (API)
## ----------------------------------------------------------------------

def setLicense(license_key):
    """
    Set the U{Google APIs <http://www.google.com/apis/>} license key

    @param license_key: The new key to use
    @type  license_key: String
    @todo: validate the key?
    """
    global LICENSE_KEY
    LICENSE_KEY = license_key


def getLicense(license_key = None):
    """
    Get the U{Google APIs <http://www.google.com/apis/>} license key

    The key can be read from any number of locations.  See the
    module-level documentation for the search order.  If no location
    yields a key, the command-line usage text is printed before the
    exception is raised.

    @param license_key: (optional) key supplied by the caller; it is the
        first location consulted
    @type  license_key: String
    @return: the license key
    @rtype:  String
    @raise NoLicenseKey: if no valid key could be found
    """
    for get, location in _licenseLocations:
        rc = get(license_key)
        if rc:
            return rc
    _usage()
    raise NoLicenseKey('get a license key at http://www.google.com/apis/')


def setProxy(http_proxy):
    """
    Set the HTTP proxy to be used when accessing Google

    @param http_proxy: the proxy to use
    @type  http_proxy: String
    @todo: validate the input?
    """
    global HTTP_PROXY
    HTTP_PROXY = http_proxy


def getProxy(http_proxy = None):
    """
    Get the HTTP proxy we use for accessing Google

    @param http_proxy: (optional) proxy supplied by the caller; when it
        is false the module-level HTTP_PROXY is returned instead
    @type  http_proxy: String
    @return: the proxy
    @rtype:  String
    """
    return http_proxy or HTTP_PROXY


def _contentsOf(dirname, filename):
    """
    Return the stripped contents of C{dirname/filename}, or None.

    None is returned when the file cannot be opened (missing, a
    directory, unreadable).  Surrounding whitespace is removed so that
    a key file ending in a newline does not yield a key with a trailing
    newline; a file containing only whitespace therefore yields an empty
    (false) string and the license lookup moves on to the next location.

    @param dirname: directory to look in
    @type  dirname: String
    @param filename: name of the file inside C{dirname}
    @type  filename: String
    @return: file contents without leading/trailing whitespace, or None
    """
    filename = os.path.join(dirname, filename)
    try:
        fsock = open(filename)
    except IOError:
        return None
    # try/finally (rather than "with") keeps the handle from leaking
    # while staying valid on the old interpreters this module targets.
    try:
        contents = fsock.read()
    finally:
        fsock.close()
    return contents.strip()


def _getScriptDir():
    """
    Return the absolute directory containing this module.

    When run as a script the directory of C{sys.argv[0]} is used;
    otherwise the directory of the imported module's C{__file__}.
    """
    if __name__ == '__main__':
        return os.path.abspath(os.path.dirname(sys.argv[0]))
    else:
        return os.path.abspath(os.path.dirname(sys.modules[__name__].__file__))


def _marshalBoolean(value):
    """
    Map a Python truth value onto the facade's true/false constants.
    """
    if value:
        return _true
    else:
        return _false


def _getRemoteServer( http_proxy ):
    """
    Build the remote-server object for the Google endpoint.

    Delegates to C{GoogleSOAPFacade.getProxy} with the module's URL and
    namespace constants.
    """
    return GoogleSOAPFacade.getProxy( _url, _namespace, http_proxy )


## ----------------------------------------------------------------------
## search results classes
## ----------------------------------------------------------------------

class _SearchBase:
    """
    Base class that copies a mapping of result fields onto the instance.

    Values that are facade struct objects are converted with
    C{GoogleSOAPFacade.toDict}; sequences whose first element is a
    struct are converted element by element into a list of dictionaries.
    """
    def __init__(self, params):
        struct_type = GoogleSOAPFacade.structType
        for k, v in params.items():
            if isinstance(v, struct_type):
                v = GoogleSOAPFacade.toDict( v )

            # Probe the first element to detect "sequence of structs".
            # Most values are not indexable (or are empty / are dicts
            # without a 0 key), so any failure here just means "no".
            try:
                first = v[0]
            except Exception:
                first = None

            if isinstance(first, struct_type):
                # The original code referenced an undefined name here
                # (SOAPProxy), so this conversion silently never ran.
                v = [ GoogleSOAPFacade.toDict( node ) for node in v ]

            self.__dict__[str(k)] = v


## ----------------------------------------------------------------------

class SearchResultsMetaData(_SearchBase):
    """
    Container class for metadata about a given search query's results.

    @ivar documentFiltering: is duplicate page filtering active?

    @ivar searchComments: human-readable informational message

        example::

             "'the' is a very common word and was not included in your search"

    @ivar estimatedTotalResultsCount: estimated total number of results
        for this query.

    @ivar estimateIsExact: is estimatedTotalResultsCount an exact value?

    @ivar searchQuery: search string that initiated this search

    @ivar startIndex: index of the first result returned (zero-based)

    @ivar endIndex: index of the last result returned (zero-based)

    @ivar searchTips: human-readable informational message on how to better
        use Google.

    @ivar directoryCategories: list of categories for the search results

        This field is a list of dictionaries, like so::

            { 'fullViewableName': 'the Open Directory category',
              'specialEncoding':  'encoding scheme of this directory category'
            }

    @ivar searchTime: total search time, in seconds
    """
    pass


## ----------------------------------------------------------------------

class SearchResult(_SearchBase):
    """
    Encapsulates the results from a search.

    @ivar URL: URL

    @ivar title: title (HTML)

    @ivar snippet: snippet showing query context (HTML)

    @ivar cachedSize: size of cached version of this result, (KB)

    @ivar relatedInformationPresent: is the "related:" keyword supported?

        Flag indicates that the "related:" keyword is supported for this URL

    @ivar hostName: used when filtering occurs

        When filtering occurs, a maximum of two results from any given
        host is returned.  When this occurs, the second resultElement
        that comes from that host contains the host name in this parameter.

    @ivar directoryCategory: Open Directory category information

        This field is a dictionary with the following values::

            { 'fullViewableName': 'the Open Directory category',
              'specialEncoding' : 'encoding scheme of this directory category'
            }

    @ivar directoryTitle: Open Directory title of this result (or blank)

    @ivar summary: Open Directory summary for this result (or blank)
    """
    pass


## ----------------------------------------------------------------------

class SearchReturnValue:
    """
    complete search results for a single query

    @ivar meta: L{SearchResultsMetaData} instance for this query

    @ivar results: list of L{SearchResult} objects for this query
    """
    def __init__( self, metadata, results ):
        self.meta = metadata
        self.results = results


## ----------------------------------------------------------------------
## main functions
## ----------------------------------------------------------------------

def doGoogleSearch( q, start = 0, maxResults = 10, filter = 1,
                    restrict='', safeSearch = 0, language = '',
                    inputencoding = '', outputencoding = '',
                    license_key = None, http_proxy = None ):
    """
    Search Google using the SOAP API and return the results.

    You need a license key to call this function; see the
    U{Google APIs <http://www.google.com/apis/>} site to get one.
    Then you can either pass it to this function every time, or
    set it globally; see the L{google} module-level docs for details.

    See U{http://www.google.com/help/features.html}
    for examples of advanced features.  Anything that works at the
    Google web site will work as a query string in this method.

    You can use the C{start} and C{maxResults} parameters to page
    through multiple pages of results.  Note that 'maxResults' is
    currently limited by Google to 10.

    See the API reference for more advanced examples and a full list of
    country codes and topics for use in the C{restrict} parameter, along
    with legal values for the C{language}, C{inputencoding}, and
    C{outputencoding} parameters.

    You can download the API documentation
    U{here <http://www.google.com/apis/download.html>}.

    @param q: search string.
    @type  q: String

    @param start: (optional) zero-based index of first desired result.
    @type  start: int

    @param maxResults: (optional) maximum number of results to return.
    @type  maxResults: int

    @param filter: (optional) flag to request filtering of similar results
    @type  filter: int

    @param restrict: (optional) restrict results by country or topic.
    @type  restrict: String

    @param safeSearch: (optional)
    @type  safeSearch: int

    @param language: (optional)
    @type  language: String

    @param inputencoding: (optional)
    @type  inputencoding: String

    @param outputencoding: (optional)
    @type  outputencoding: String

    @param license_key: (optional) the Google API license key to use
    @type  license_key: String

    @param http_proxy: (optional) the HTTP proxy to use for talking to Google
    @type  http_proxy: String

    @return: the search results encapsulated in an object
    @rtype:  L{SearchReturnValue}

    @raise NoLicenseKey: if no license key could be found
    """
    license_key = getLicense( license_key )
    http_proxy = getProxy( http_proxy )
    remoteserver = _getRemoteServer( http_proxy )

    # The two flags are sent as the facade's boolean constants.
    filter = _marshalBoolean( filter )
    safeSearch = _marshalBoolean( safeSearch )

    data = remoteserver.doGoogleSearch( license_key, q, start, maxResults,
                                        filter, restrict, safeSearch,
                                        language, inputencoding,
                                        outputencoding )

    # Everything except the result elements is treated as metadata.
    metadata = GoogleSOAPFacade.toDict( data )
    del metadata["resultElements"]
    metadata = SearchResultsMetaData( metadata )

    results = [ SearchResult( GoogleSOAPFacade.toDict( node ) )
                for node in data.resultElements ]

    return SearchReturnValue( metadata, results )


## ----------------------------------------------------------------------

def doGetCachedPage( url, license_key = None, http_proxy = None ):
    """
    Retrieve a page from the Google cache.

    You need a license key to call this function; see the
    U{Google APIs <http://www.google.com/apis/>} site to get one.
    Then you can either pass it to this function every time, or
    set it globally; see the L{google} module-level docs for details.

    @param url: full URL to the page to retrieve
    @type  url: String

    @param license_key: (optional) the Google API key to use
    @type  license_key: String

    @param http_proxy: (optional) the HTTP proxy server to use
    @type  http_proxy: String

    @return: full text of the cached page
    @rtype:  String

    @raise NoLicenseKey: if no license key could be found
    """
    license_key = getLicense( license_key )
    http_proxy = getProxy( http_proxy )
    remoteserver = _getRemoteServer( http_proxy )

    return remoteserver.doGetCachedPage( license_key, url )


## ----------------------------------------------------------------------

def doSpellingSuggestion( phrase, license_key = None, http_proxy = None ):
    """
    Get spelling suggestions from Google

    You need a license key to call this function; see the
    U{Google APIs <http://www.google.com/apis/>} site to get one.
    Then you can either pass it to this function every time, or
    set it globally; see the L{google} module-level docs for details.

    @param phrase: word or phrase to spell-check
    @type  phrase: String

    @param license_key: (optional) the Google API key to use
    @type  license_key: String

    @param http_proxy: (optional) the HTTP proxy to use
    @type  http_proxy: String

    @return: text of any suggested replacement, or None

    @raise NoLicenseKey: if no license key could be found
    """
    license_key = getLicense( license_key )
    http_proxy = getProxy( http_proxy )
    remoteserver = _getRemoteServer( http_proxy )

    return remoteserver.doSpellingSuggestion( license_key, phrase )


## ----------------------------------------------------------------------
## functional test suite (see googletest.py for unit test suite)
## ----------------------------------------------------------------------

def _test():
    """
    Run functional test suite.

    Returns silently (after getLicense has printed the usage text) when
    no license key is available.
    """
    try:
        getLicense(None)
    except NoLicenseKey:
        return

    print("Searching for Python at google.com...")
    data = doGoogleSearch( "Python" )
    _output( data, { "func": "doGoogleSearch" } )

    print("\nSearching for 5 _French_ pages about Python, ")
    print("encoded in ISO-8859-1...")
    data = doGoogleSearch( "Python", language = 'lang_fr',
                           outputencoding = 'ISO-8859-1',
                           maxResults = 5 )
    _output( data, { "func": "doGoogleSearch" } )

    phrase = "Pyhton programming languager"
    print("\nTesting spelling suggestions for '%s'..." % phrase)
    data = doSpellingSuggestion( phrase )
    _output( data, { "func": "doSpellingSuggestion" } )


## ----------------------------------------------------------------------
## Command-line interface
## ----------------------------------------------------------------------

class _OutputFormatter:
    """
    Base class for command-line output formatters.
    """
    def boil(self, data):
        """
        Encode unicode text as ISO-8859-1 (unencodable characters are
        replaced); return any other value unchanged.
        """
        if isinstance(data, type(u"")):
            return data.encode("ISO-8859-1", "replace")
        else:
            return data


class _TextOutputFormatter(_OutputFormatter):
    """
    Formatter that prints results as plain "name: value" text lines.

    Each public method is named after the API function whose result it
    prints; C{_output} selects the method via C{params["func"]}.
    """
    def common(self, data, params):
        """
        Print the search metadata when C{params["showMeta"]} is set.

        Results without a C{meta} attribute (the plain strings returned
        by the cache and spelling calls) have no metadata and are
        skipped instead of raising AttributeError.
        """
        if not params.get("showMeta", 0):
            return
        meta = getattr(data, "meta", None)
        if meta is None:
            return
        for category in meta.directoryCategories:
            print("directoryCategory: %s" %
                  self.boil(category["fullViewableName"]))
        for attr in dir(meta):
            # directoryCategories was printed above; skip dunder names.
            if attr == "directoryCategories" or attr[:2] == '__':
                continue
            print("%s: %s" % (attr, self.boil(getattr(meta, attr))))

    def doGoogleSearch(self, data, params):
        """
        Print every attribute of each search result, one blank line
        between results, followed by the optional metadata.

        Honors C{params["feelingLucky"]} (first hit only) and
        C{params["reverseOrder"]}.
        """
        # Work on a copy so reversing does not mutate data.results.
        results = list(data.results)
        if params.get("feelingLucky", 0):
            results = results[:1]
        if params.get("reverseOrder", 0):
            results.reverse()
        for result in results:
            for attr in dir(result):
                if attr == "directoryCategory":
                    print("directoryCategory: %s" %
                          self.boil(result.directoryCategory["fullViewableName"]))
                elif attr[:2] != '__':
                    print("%s: %s" % (attr, self.boil(getattr(result, attr))))
            print("")
        self.common(data, params)

    def doGetCachedPage(self, data, params):
        """
        Print the returned value as-is, then any available metadata.
        """
        print(data)
        self.common(data, params)

    # Spelling suggestions are printed the same way as cached pages.
    doSpellingSuggestion = doGetCachedPage


def _makeFormatter(outputFormat):
    """
    Instantiate the formatter class named C{_<Format>OutputFormatter}.

    @raise KeyError: if no such class exists in this module
    """
    classname = "_%sOutputFormatter" % outputFormat.capitalize()
    return globals()[classname]()


def _output(results, params):
    """
    Print C{results} using the formatter method named by C{params["func"]}.

    C{params["outputFormat"]} selects the formatter (default "text").
    """
    formatter = _makeFormatter(params.get("outputFormat", "text"))
    outputmethod = getattr(formatter, params["func"])
    outputmethod(results, params)


def main(argv):
    """
    Command-line interface.

    @param argv: command-line arguments, without the program name
    @type  argv: list of String
    """
    if not argv:
        _usage()
        return

    q = None
    func = None
    http_proxy = None
    license_key = None
    feelingLucky = 0
    showMeta = 0
    reverseOrder = 0
    runTest = 0
    outputFormat = "text"

    try:
        opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1",
            ["search=", "cache=", "spelling=", "key=", "lucky", "meta",
             "reverse", "proxy=", "help", "version", "test"])
    except getopt.GetoptError:
        _usage()
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-s", "--search"):
            q = arg
            func = "doGoogleSearch"
        elif opt in ("-c", "--cache"):
            q = arg
            func = "doGetCachedPage"
        elif opt in ("-p", "--spelling"):
            q = arg
            func = "doSpellingSuggestion"
        elif opt in ("-k", "--key"):
            license_key = arg
        elif opt in ("-l", "-1", "--lucky"):
            feelingLucky = 1
        elif opt in ("-m", "--meta"):
            showMeta = 1
        elif opt in ("-r", "--reverse"):
            reverseOrder = 1
        elif opt in ("-x", "--proxy"):
            http_proxy = arg
        elif opt in ("-h", "--help"):
            _usage()
        elif opt in ("-v", "--version"):
            _version()
        elif opt in ("-t", "--test"):
            runTest = 1

    if runTest:
        # Only override the module-level settings when a value was
        # actually given, so an existing key/proxy is not reset to None.
        if license_key:
            setLicense(license_key)
        if http_proxy:
            setProxy(http_proxy)
        _test()

    # A bare positional argument is treated as a search query.
    if args and not q:
        q = args[0]
        func = "doGoogleSearch"

    if func:
        results = globals()[func]( q, http_proxy=http_proxy,
                                   license_key=license_key )
        _output(results, {
            "func": func,
            "outputFormat": outputFormat,
            "feelingLucky": feelingLucky,
            "showMeta": showMeta,
            "reverseOrder": reverseOrder,
        })


if __name__ == '__main__':
    main(sys.argv[1:])