diff options
author | Dinko Galetic <dgaletic@everflame.(none)> | 2010-06-03 07:43:27 (GMT) |
---|---|---|
committer | Dinko Galetic <dgaletic@everflame.(none)> | 2010-06-03 07:43:27 (GMT) |
commit | 313518d63ff2f167069097fbb9f621ca8a6c62fa (patch) | |
tree | 5a73bd18cf61f0e0a99371794790214bf97bfed5 | |
parent | b42aaca7f79dcd31b349791e1a13183758bf59a9 (diff) |
Added a Python script which demonstrates the basic use of urllib2.
It opens a few sites, reads their content and locates their titles in that content.
-rw-r--r-- | data/GSOC examples/Opening websites | 64 |
1 files changed, 64 insertions, 0 deletions
# Source: data/GSOC examples/Opening websites (new file, mode 100644)
#
# This example demonstrates how urllib2 can be used to open websites and read
# some data from them.
#
# It is written for Python 2: urllib2 and raw_input() are Python 2 names.
# print is always called with a single parenthesised string, which Python 2
# treats exactly like the plain print statement.

import urllib2


# define a function which will open a bunch of links we give it in a list
def open_sites(links):
    """Open every URL in `links` and return the ones that could be opened.

    links -- a list of URL strings to open, in order.

    Returns a list of the objects returned by urllib2.urlopen(), one per URL
    that opened successfully. URLs that fail to open are reported on screen
    and skipped. The function pauses for the enter key after every URL.
    """
    sites = []
    # Loop over the list we were given (the `links` argument), so the function
    # works with any list of URLs and not only with the global `urls` below.
    for url in links:
        print("Opening: " + url)
        # try to open that site
        try:
            site = urllib2.urlopen(url)
        except Exception:
            # `except Exception` catches the errors urlopen() can raise while
            # still letting Ctrl+C (KeyboardInterrupt) stop the program.
            # Does an error occur with any of the default urls?
            # Practice: If so, could you fix it?
            print("An error has occured, skipping " + url)
            print("")
            raw_input("...press enter key to continue...")
            continue
        # geturl() gives the address we actually ended up at, which differs
        # from the one we asked for when the site redirected us.
        if site.geturl() != url:
            print("Careful! Site " + url + " has redirected you to " + site.geturl())
        print("Site " + site.geturl() + " is now open.")
        print("")
        sites.append(site)
        raw_input("...press enter key to continue...")
        print("")
    return sites


# The default URLs are part of the exercise (see the practice question inside
# open_sites), so they are kept exactly as they were.
url1 = "http://www.google.com"
url2 = "http://www.sugarlabs.org"
url3 = "www.wikipedia.org"
urls = [url1, url2, url3]

sites = open_sites(urls)

print("")
print("Let's read those sites and find their titles.")
print("")
raw_input("...press enter key to continue...")
print("")

for site in sites:
    site_content = site.read()
    # "<title>" is 7 characters long, so the title text starts 7 characters
    # after the place where the tag was found.
    # NOTE: find() returns -1 when the text is not there, so for a page
    # without a <title> tag title_at would be 6 and the result meaningless.
    title_at = site_content.find("<title>") + 7
    print("The title of site at " + site.geturl() + " begins at its index " + str(title_at))
    title_ends = site_content.find("</title>", title_at)
    title = site_content[title_at:title_ends]
    # In Python, \ is the so-called "escape" character. Since some characters have
    # special meanings, like " or ' opening and closing a string, we have to tell
    # the interpreter to ignore such meanings when we wish to put those precise
    # characters in a string (or print them). In the following line, we wish to
    # print the " character so we "escape" it - by putting \ in before it.
    # Practice: What would we have to do to print an escape character \ ?
    print("The title is: \"" + title + "\"")
    print("")
    # An index of -1 refers to the first element from the end. Thus, this
    # comparison checks whether the current element is the last one.
    # Practice: Why would we want that?
    if site == sites[-1]:
        raw_input("...press enter to finish..:")
    else:
        raw_input("...press enter key to continue...")
    print("")