Web   ·   Wiki   ·   Activities   ·   Blog   ·   Lists   ·   Chat   ·   Meeting   ·   Bugs   ·   Git   ·   Translate   ·   Archive   ·   People   ·   Donate
summaryrefslogtreecommitdiffstats
path: root/shell/google/google.py
blob: a20ba513af779af601e3a49039b3ca01e3f4b4e6 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
"""
Python wrapper for Google web APIs

This module allows you to access Google's web APIs through SOAP,
to do things like search Google and get the results programmatically.
Described U{here <http://www.google.com/apis/>}
  
You need a Google-provided license key to use these services.
Follow the link above to get one.  These functions will look in
several places (in this order) for the license key:

    - the "license_key" argument of each function
    - the module-level LICENSE_KEY variable (call setLicense once to set it)
    - an environment variable called GOOGLE_LICENSE_KEY
    - a file called ".googlekey" in the current directory
    - a file called "googlekey.txt" in the current directory
    - a file called ".googlekey" in your home directory
    - a file called "googlekey.txt" in your home directory
    - a file called ".googlekey" in the same directory as google.py
    - a file called "googlekey.txt" in the same directory as google.py

Sample usage::
    
    >>> import google
    >>> google.setLicense('...') # must get your own key!
    >>> data = google.doGoogleSearch('python')
    >>> data.meta.searchTime
    0.043221000000000002
    
    >>> data.results[0].URL
    'http://www.python.org/'
    
    >>> data.results[0].title
    '<b>Python</b> Language Website'

@newfield contrib: Contributors
@author:   Mark Pilgrim <f8dy@diveintomark.org>
@author:   Brian Landers <brian@bluecoat93.org>
@license:  Python
@version:  0.6
@contrib:  David Ascher, for the install script
@contrib:  Erik Max Francis, for the command line interface
@contrib:  Michael Twomey, for HTTP proxy support
@contrib:  Mark Recht, for patches to support SOAPpy
"""

# Module metadata.  _version() prints __version__, __copyright__, __date__
# and __credits__ by looking them up in globals().
__author__ = "Mark Pilgrim (f8dy@diveintomark.org)"
__version__ = "0.6"
# The slices strip the CVS keyword wrapper and keep only the value:
# "$Revision: 1.5 $"[11:-2] drops the leading "$Revision: " and trailing " $".
__cvsversion__ = "$Revision: 1.5 $"[11:-2]
# Likewise "$Date: ... $"[7:-2] drops the leading "$Date: " and trailing " $".
__date__ = "$Date: 2004/02/25 23:46:07 $"[7:-2]
__copyright__ = "Copyright (c) 2002 Mark Pilgrim"
__license__ = "Python"
__credits__ = """David Ascher, for the install script
Erik Max Francis, for the command line interface
Michael Twomey, for HTTP proxy support"""

import os, sys, getopt
import GoogleSOAPFacade

# Module-wide defaults.  LICENSE_KEY is assigned by setLicense() and consulted
# by the second entry of _licenseLocations; HTTP_PROXY is assigned by
# setProxy() and used by getProxy() when no explicit proxy is passed.
LICENSE_KEY = None
HTTP_PROXY  = None

#
# Constants
#
# Endpoint and namespace handed to GoogleSOAPFacade.getProxy() by
# _getRemoteServer().
_url         = 'http://api.google.com/search/beta2'
_namespace   = 'urn:GoogleSearch'
# The two file names probed for a license key in each candidate directory.
_googlefile1 = ".googlekey"
_googlefile2 = "googlekey.txt"

# Boolean values as defined by the SOAP facade; _marshalBoolean() maps Python
# truth values onto these.
_false = GoogleSOAPFacade.false
_true  = GoogleSOAPFacade.true

# Ordered table of places to look for a license key.  Each entry is a pair
# (getter, description): the getter receives the license_key argument that was
# passed to getLicense() and returns a key or a false value; the description
# is the human-readable text printed by _usage().
#
# Order matters twice: getLicense() returns the first true result, and
# _usage() skips the first two entries with [2:], so new locations must not be
# inserted before the environment-variable entry without adjusting that slice.
#
# The lambdas reference LICENSE_KEY, _contentsOf and _getScriptDir by name, so
# they are resolved when the getter is called, not when this table is built.
_licenseLocations = (
    ( lambda key: key,
      'passed to the function in license_key variable' ),
    ( lambda key: LICENSE_KEY, 
      'module-level LICENSE_KEY variable (call setLicense to set it)' ),
    ( lambda key: os.environ.get( 'GOOGLE_LICENSE_KEY', None ),
      'an environment variable called GOOGLE_LICENSE_KEY' ),
    ( lambda key: _contentsOf( os.getcwd(), _googlefile1 ), 
      '%s in the current directory' % _googlefile1),
    ( lambda key: _contentsOf( os.getcwd(), _googlefile2 ),
      '%s in the current directory' % _googlefile2),
    # When HOME is unset the directory is '', so os.path.join() yields a bare
    # file name, i.e. a path relative to the current directory.
    ( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile1 ),
      '%s in your home directory' % _googlefile1),
    ( lambda key: _contentsOf( os.environ.get( 'HOME', '' ), _googlefile2 ),
      '%s in your home directory' % _googlefile2 ),
    ( lambda key: _contentsOf( _getScriptDir(), _googlefile1 ),
      '%s in the google.py directory' % _googlefile1 ),
    ( lambda key: _contentsOf( _getScriptDir(), _googlefile2 ),
      '%s in the google.py directory' % _googlefile2 )
)

## ----------------------------------------------------------------------
## Exceptions
## ----------------------------------------------------------------------

class NoLicenseKey(Exception):
    """
    Raised by L{getLicense} when none of the known locations yields a
    license key.
    """

## ----------------------------------------------------------------------
## administrative functions (non-API)
## ----------------------------------------------------------------------

def _version():
    """
    Print the package name, version, copyright, release date and credits.

    The values are the module-level C{__version__}, C{__copyright__},
    C{__date__} and C{__credits__} variables, looked up through globals().
    """
    banner = """PyGoogle %(__version__)s
%(__copyright__)s
released %(__date__)s

Thanks to:
%(__credits__)s"""
    print(banner % globals())

    
def _usage():
    """
    Print the command-line help text, followed by the list of places that
    are searched for a license key.

    The first two entries of C{_licenseLocations} (the function argument and
    the module-level variable) are not listed individually; the help text
    summarises them as "the key specified on the command line".
    """
    program = os.path.basename(sys.argv[0])
    helpText = """Usage: %(program)s [options] [querytype] query

options:
  -k, --key= <license key> Google license key (see important note below)
  -1, -l, --lucky          show only first hit
  -m, --meta               show meta information
  -r, --reverse            show results in reverse order
  -x, --proxy= <url>       use HTTP proxy
  -h, --help               print this help
  -v, --version            print version and copyright information
  -t, --test               run test queries

querytype:
  -s, --search= <query>    search (default)
  -c, --cache= <url>       retrieve cached page
  -p, --spelling= <word>   check spelling

IMPORTANT NOTE: all Google functions require a valid license key;
visit http://www.google.com/apis/ to get one.  %(program)s will look in
these places (in order) and use the first license key it finds:
  * the key specified on the command line""" % {"program": program}
    print(helpText)
    for _getter, description in _licenseLocations[2:]:
        print("  * %s" % (description,))

## ----------------------------------------------------------------------
## utility functions (API)
## ----------------------------------------------------------------------

def setLicense(license_key):
    """
    Set the U{Google APIs <http://www.google.com/apis/>} license key

    The key is stored in the module-level LICENSE_KEY variable, which is the
    second location consulted by L{getLicense}.  The value is stored as
    given; no validation is performed.
    
    @param license_key: The new key to use
    @type  license_key: String
    @todo: validate the key?
    """
    global LICENSE_KEY
    LICENSE_KEY = license_key
    
    
def getLicense(license_key = None):
    """
    Find the U{Google APIs <http://www.google.com/apis/>} license key

    Every getter in C{_licenseLocations} is tried in order and the first
    true value is returned.  See the module-level documentation for the
    search order.

    If no location yields a key, the command-line usage text is printed
    before the exception is raised.

    @param license_key: (optional) explicit key; it is the first location
        tried
    @type  license_key: String

    @return: the license key
    @rtype:  String
    @raise NoLicenseKey: if no valid key could be found
    """
    for lookup, _description in _licenseLocations:
        candidate = lookup(license_key)
        if candidate:
            return candidate

    # Every location came up empty: show the user where keys are looked for.
    _usage()
    raise NoLicenseKey('get a license key at http://www.google.com/apis/')


def setProxy(http_proxy):
    """
    Set the HTTP proxy to be used when accessing Google

    The value is stored in the module-level HTTP_PROXY variable, which
    L{getProxy} falls back to when no explicit proxy is given.  The value is
    stored as given; no validation is performed.
    
    @param http_proxy: the proxy to use
    @type  http_proxy: String
    @todo: validate the input?
    """
    global HTTP_PROXY
    HTTP_PROXY = http_proxy


def getProxy(http_proxy = None):
    """
    Work out which HTTP proxy to use for accessing Google

    @param http_proxy: (optional) explicit proxy; when true it is returned
        as-is, otherwise the module-level HTTP_PROXY value is returned
    @type  http_proxy: String

    @return: the proxy
    @rtype:  String
    """
    if http_proxy:
        return http_proxy
    return HTTP_PROXY


def _contentsOf(dirname, filename):
    """
    Return the full contents of C{dirname/filename}, or None if the file
    cannot be opened.

    This is used to probe a list of candidate license-key files, so a path
    that is missing, is a directory, or is not readable is treated as
    "no key here" rather than as an error.

    @param dirname: directory to look in ('' means a relative path)
    @type  dirname: String

    @param filename: name of the file inside C{dirname}
    @type  filename: String

    @return: the file contents exactly as read, or None
    @rtype:  String
    """
    path = os.path.join(dirname, filename)
    # Open directly instead of testing os.path.exists() first: that avoids
    # the window between the check and the open, and also covers paths that
    # exist but cannot be opened as a file.
    try:
        fsock = open(path)
    except (IOError, OSError):
        return None
    # NOTE(review): the contents are returned verbatim, including any
    # trailing newline -- confirm whether callers expect a stripped key.
    try:
        return fsock.read()
    finally:
        # Close the handle even when read() raises.
        fsock.close()


def _getScriptDir():
    """
    Return the absolute path of the directory containing this module.

    When the module runs as a script the location is taken from
    C{sys.argv[0]}; otherwise from the C{__file__} of the module's entry in
    C{sys.modules}.
    """
    if __name__ != '__main__':
        anchor = sys.modules[__name__].__file__
    else:
        anchor = sys.argv[0]
    return os.path.abspath(os.path.dirname(anchor))


def _marshalBoolean(value):
    """
    Map a Python truth value onto the SOAP facade's boolean constants:
    C{_true} for a true value, C{_false} otherwise.
    """
    if not value:
        return _false
    return _true


def _getRemoteServer( http_proxy ):
    """
    Return the object on which the remote Google methods are invoked.

    Delegates to GoogleSOAPFacade.getProxy() with the module's endpoint URL
    and namespace plus the given HTTP proxy.
    """
    return GoogleSOAPFacade.getProxy( _url, _namespace, http_proxy )
    

## ----------------------------------------------------------------------
## search results classes
## ----------------------------------------------------------------------

class _SearchBase:
    """
    Base class for the search result containers.

    Every key/value pair of C{params} becomes an instance attribute.  SOAP
    structs are converted with GoogleSOAPFacade.toDict() first, and a
    sequence whose first element is a SOAP struct is converted to a list of
    such dictionaries.
    """
    def __init__(self, params):
        """
        @param params: mapping of attribute names to values
        @type  params: dict
        """
        for k, v in params.items():
            if isinstance(v, GoogleSOAPFacade.structType):
                v = GoogleSOAPFacade.toDict( v )

            # Probe for "non-empty sequence of structs".  Scalars, empty
            # sequences and dictionaries fail the v[0] lookup and are stored
            # unchanged.
            try:
                first = v[0]
            except (TypeError, IndexError, KeyError, AttributeError):
                first = None

            if isinstance(first, GoogleSOAPFacade.structType):
                # This used to call the undefined name SOAPProxy; the
                # resulting NameError was swallowed by a bare except, so the
                # conversion never actually happened.
                v = [ GoogleSOAPFacade.toDict( node ) for node in v ]

            self.__dict__[str(k)] = v

## ----------------------------------------------------------------------

class SearchResultsMetaData(_SearchBase):
    """
    Metadata describing the result set of one search query.

    All attributes are populated by L{_SearchBase} from the values returned
    by the search call.

    @ivar documentFiltering: is duplicate page filtering active?

    @ivar searchComments: human-readable informational message, for
        example::

             "'the' is a very common word and was not included in your search"

    @ivar estimatedTotalResultsCount: estimated total number of results
        for this query.

    @ivar estimateIsExact: is estimatedTotalResultsCount an exact value?

    @ivar searchQuery: search string that initiated this search

    @ivar startIndex: index of the first result returned (zero-based)

    @ivar endIndex: index of the last result returned (zero-based)

    @ivar searchTips: human-readable informational message on how to better
        use Google.

    @ivar directoryCategories: list of categories for the search results.
        Each item is a dictionary of this form::

            { 'fullViewableName': 'the Open Directory category',
              'specialEncoding':  'encoding scheme of this directory category'
            }

    @ivar searchTime: total search time, in seconds
    """

## ----------------------------------------------------------------------

class SearchResult(_SearchBase):
    """
    A single hit returned by a search.

    All attributes are populated by L{_SearchBase} from the values returned
    by the search call.

    @ivar URL: URL

    @ivar title: title (HTML)

    @ivar snippet: snippet showing query context (HTML)

    @ivar cachedSize: size of cached version of this result, (KB)

    @ivar relatedInformationPresent: flag indicating that the "related:"
        keyword is supported for this URL

    @ivar hostName: used when filtering occurs.  When filtering occurs, a
        maximum of two results from any given host is returned; the second
        resultElement that comes from that host then contains the host name
        in this parameter.

    @ivar directoryCategory: Open Directory category information, as a
        dictionary with the following values::

            { 'fullViewableName': 'the Open Directory category',
              'specialEncoding' : 'encoding scheme of this directory category'
            }

    @ivar directoryTitle: Open Directory title of this result (or blank)

    @ivar summary: Open Directory summary for this result (or blank)
    """

## ----------------------------------------------------------------------

class SearchReturnValue:
    """
    Everything returned for a single query: the metadata plus the hits.

    @ivar meta: L{SearchResultsMetaData} instance for this query

    @ivar results: list of L{SearchResult} objects for this query
    """
    def __init__( self, metadata, results ):
        """
        @param metadata: stored as C{meta}
        @param results:  stored as C{results}
        """
        self.results = results
        self.meta    = metadata

## ----------------------------------------------------------------------
## main functions
## ----------------------------------------------------------------------

def doGoogleSearch( q, start = 0, maxResults = 10, filter = 1,
                    restrict='', safeSearch = 0, language = '',
                    inputencoding = '', outputencoding = '',
                    license_key = None, http_proxy = None ):
    """
    Run a Google search through the SOAP API and wrap the response.

    A license key is required; get one from the
    U{Google APIs <http://www.google.com/apis/>} site, then either pass it
    on every call or set it globally (see the L{google} module-level docs).

    Any query string that works at the Google web site works here; see
    U{http://www.google.com/help/features.html} for examples of advanced
    features.

    Use C{start} and C{maxResults} to page through the results.  Note that
    'maxResults' is currently limited by Google to 10.

    The API reference has more advanced examples, the full list of country
    codes and topics accepted by C{restrict}, and the legal values for
    C{language}, C{inputencoding} and C{outputencoding}.  The API
    documentation can be downloaded
    U{here <http://www.google.com/apis/download.html>}.

    @param q: search string.
    @type  q: String

    @param start: (optional) zero-based index of first desired result.
    @type  start: int

    @param maxResults: (optional) maximum number of results to return.
    @type  maxResults: int

    @param filter: (optional) flag to request filtering of similar results
    @type  filter: int

    @param restrict: (optional) restrict results by country or topic.
    @type  restrict: String

    @param safeSearch: (optional)
    @type  safeSearch: int

    @param language: (optional)
    @type  language: String

    @param inputencoding: (optional)
    @type  inputencoding: String

    @param outputencoding: (optional)
    @type  outputencoding: String

    @param license_key: (optional) the Google API license key to use
    @type  license_key: String

    @param http_proxy: (optional) the HTTP proxy to use for talking to Google
    @type  http_proxy: String

    @return: the search results encapsulated in an object
    @rtype:  L{SearchReturnValue}
    @raise NoLicenseKey: if no license key could be found
    """
    key    = getLicense( license_key )
    server = _getRemoteServer( getProxy( http_proxy ) )

    # The two flags travel as the facade's boolean values.
    filterFlag = _marshalBoolean( filter )
    safeFlag   = _marshalBoolean( safeSearch )

    response = server.doGoogleSearch( key, q, start, maxResults,
                                      filterFlag, restrict, safeFlag,
                                      language, inputencoding,
                                      outputencoding )

    # Everything in the response except the hits themselves is metadata.
    summary = GoogleSOAPFacade.toDict( response )
    del summary["resultElements"]
    meta = SearchResultsMetaData( summary )

    hits = []
    for element in response.resultElements:
        hits.append( SearchResult( GoogleSOAPFacade.toDict( element ) ) )

    return SearchReturnValue( meta, hits )

## ----------------------------------------------------------------------

def doGetCachedPage( url, license_key = None, http_proxy = None ):
    """
    Fetch a page from the Google cache.

    A license key is required; get one from the
    U{Google APIs <http://www.google.com/apis/>} site, then either pass it
    on every call or set it globally (see the L{google} module-level docs).

    @param url: full URL to the page to retrieve
    @type  url: String

    @param license_key: (optional) the Google API key to use
    @type  license_key: String

    @param http_proxy:  (optional) the HTTP proxy server to use
    @type  http_proxy:  String

    @return: full text of the cached page
    @rtype:  String
    @raise NoLicenseKey: if no license key could be found
    """
    key    = getLicense( license_key )
    server = _getRemoteServer( getProxy( http_proxy ) )
    return server.doGetCachedPage( key, url )

## ----------------------------------------------------------------------

def doSpellingSuggestion( phrase, license_key = None, http_proxy = None ):
    """
    Ask Google for a spelling suggestion.

    A license key is required; get one from the
    U{Google APIs <http://www.google.com/apis/>} site, then either pass it
    on every call or set it globally (see the L{google} module-level docs).

    @param phrase: word or phrase to spell-check
    @type  phrase: String

    @param license_key: (optional) the Google API key to use
    @type  license_key: String

    @param http_proxy: (optional) the HTTP proxy to use
    @type  http_proxy: String

    @return: text of any suggested replacement, or None
    @raise NoLicenseKey: if no license key could be found
    """
    key    = getLicense( license_key )
    server = _getRemoteServer( getProxy( http_proxy ) )
    return server.doSpellingSuggestion( key, phrase )

## ----------------------------------------------------------------------
## functional test suite (see googletest.py for unit test suite)
## ----------------------------------------------------------------------

def _test():
    """
    Run the functional test suite against the live service.

    Performs two searches and one spelling check and prints the results
    with the text formatter.  Returns without running anything when no
    license key can be found.
    """
    try:
        getLicense(None)
    except NoLicenseKey:
        # Nothing can be tested without a key.
        return

    print("Searching for Python at google.com...")
    plainHits = doGoogleSearch( "Python" )
    _output( plainHits, { "func": "doGoogleSearch"} )

    print("\nSearching for 5 _French_ pages about Python, ")
    print("encoded in ISO-8859-1...")
    frenchHits = doGoogleSearch( "Python",
                                 language = 'lang_fr',
                                 outputencoding = 'ISO-8859-1',
                                 maxResults = 5 )
    _output( frenchHits, { "func": "doGoogleSearch" } )

    phrase = "Pyhton programming languager"
    print("\nTesting spelling suggestions for '%s'..." % phrase)
    suggestion = doSpellingSuggestion( phrase )
    _output( suggestion, { "func": "doSpellingSuggestion" } )

## ----------------------------------------------------------------------
## Command-line interface
## ----------------------------------------------------------------------

class _OutputFormatter:
    """
    Base class for the command-line output formatters.
    """
    def boil(self, data):
        """
        Make a value ready for printing.

        Unicode strings (including instances of unicode subclasses) are
        encoded as ISO-8859-1, with characters that cannot be encoded
        replaced; every other value is returned unchanged.

        @param data: the value to prepare
        @return: the encoded string, or C{data} itself
        """
        # isinstance rather than an exact type comparison, so that subclasses
        # of the unicode type are encoded too.
        if isinstance(data, type(u"")):
            return data.encode("ISO-8859-1", "replace")
        return data

class _TextOutputFormatter(_OutputFormatter):
    """
    Formatter that prints results to standard output as "name: value" lines.

    Each public method is named after the API function whose return value it
    prints; L{_output} selects the method by that name.
    """
    def common(self, data, params):
        """
        Print the search metadata when C{params["showMeta"]} is set.

        Only search results carry a C{meta} attribute; cached pages and
        spelling suggestions are plain values, for which nothing is printed.
        """
        if not params.get("showMeta", 0):
            return
        # Guard against data without metadata (a string or None) instead of
        # failing with AttributeError.
        meta = getattr(data, "meta", None)
        if meta is None:
            return
        for category in meta.directoryCategories:
            print("directoryCategory: %s" %
                  (self.boil(category["fullViewableName"]),))
        for attr in dir(meta):
            # directoryCategories was printed above; skip special names.
            if attr == "directoryCategories" or attr[:2] == '__':
                continue
            print("%s: %s" % (attr, self.boil(getattr(meta, attr))))

    def doGoogleSearch(self, data, params):
        """
        Print every attribute of every search result, one blank line between
        results, honouring the C{feelingLucky} (first hit only) and
        C{reverseOrder} flags in C{params}.
        """
        results = data.results
        if params.get("feelingLucky", 0):
            results = results[:1]
        if params.get("reverseOrder", 0):
            # Reverse a copy so the caller's data.results keeps its order.
            results = list(results)
            results.reverse()
        for result in results:
            for attr in dir(result):
                if attr == "directoryCategory":
                    print("directoryCategory: %s" %
                          (self.boil(result.directoryCategory["fullViewableName"]),))
                elif attr[:2] != '__':
                    print("%s: %s" % (attr, self.boil(getattr(result, attr))))
            print("")
        self.common(data, params)

    def doGetCachedPage(self, data, params):
        """
        Print the returned value as-is.
        """
        print(data)
        self.common(data, params)

    # A spelling suggestion is printed the same way as a cached page.
    doSpellingSuggestion = doGetCachedPage

def _makeFormatter(outputFormat):
    """
    Instantiate the formatter class for the given output format.

    The class is looked up in the module globals under the name
    C{_<Format>OutputFormatter}, where C{<Format>} is the capitalized
    format name (e.g. "text" selects C{_TextOutputFormatter}).
    """
    registry = globals()
    formatterClass = registry["_%sOutputFormatter" % outputFormat.capitalize()]
    return formatterClass()

def _output(results, params):
    """
    Print C{results} with the formatter and method selected by C{params}.

    C{params["outputFormat"]} (default "text") picks the formatter;
    C{params["func"]} names the formatter method, which is called with the
    results and the whole C{params} mapping.
    """
    formatName = params.get("outputFormat", "text")
    render = getattr(_makeFormatter(formatName), params["func"])
    render(results, params)

def main(argv):
    """
    Command-line interface.

    Parses the options described by L{_usage}, optionally runs the
    functional tests, then runs the selected query (a plain search when only
    a positional argument is given) and prints the result.

    Prints the usage text and returns when C{argv} is empty; prints the
    usage text and exits with status 2 on an unknown option.

    @param argv: command-line arguments without the program name
    @type  argv: list of String
    """
    if not argv:
        _usage()
        return
    q = None
    func = None
    http_proxy = None
    license_key = None
    runTest = 0
    # Display options handed to the output formatter.  Built explicitly
    # rather than passing locals(), so the formatter only sees what it needs
    # (and not, for instance, the license key).
    params = {
        "outputFormat": "text",
        "feelingLucky": 0,
        "showMeta": 0,
        "reverseOrder": 0,
    }
    try:
        opts, args = getopt.getopt(argv, "s:c:p:k:lmrx:hvt1",
            ["search=", "cache=", "spelling=", "key=", "lucky", "meta",
             "reverse", "proxy=", "help", "version", "test"])
    except getopt.GetoptError:
        _usage()
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-s", "--search"):
            q = arg
            func = "doGoogleSearch"
        elif opt in ("-c", "--cache"):
            q = arg
            func = "doGetCachedPage"
        elif opt in ("-p", "--spelling"):
            q = arg
            func = "doSpellingSuggestion"
        elif opt in ("-k", "--key"):
            license_key = arg
        elif opt in ("-l", "-1", "--lucky"):
            params["feelingLucky"] = 1
        elif opt in ("-m", "--meta"):
            params["showMeta"] = 1
        elif opt in ("-r", "--reverse"):
            params["reverseOrder"] = 1
        elif opt in ("-x", "--proxy"):
            http_proxy = arg
        elif opt in ("-h", "--help"):
            _usage()
        elif opt in ("-v", "--version"):
            _version()
        elif opt in ("-t", "--test"):
            runTest = 1
    if runTest:
        # The test queries take no arguments, so command-line values have to
        # be installed as module defaults.  Only do so when they were given,
        # to avoid wiping a key or proxy that was configured earlier.
        if license_key:
            setLicense(license_key)
        if http_proxy:
            setProxy(http_proxy)
        _test()
    if args and not q:
        # A bare positional argument is a plain search.
        q = args[0]
        func = "doGoogleSearch"
    if func:
        results = globals()[func]( q, http_proxy=http_proxy,
                                   license_key=license_key )
        params["func"] = func
        _output(results, params)

if __name__ == '__main__':
    main(sys.argv[1:])