Wednesday, October 7, 2009

link extractor in python

During my engineering studies, I wrote a Python script that extracts the links from a web page.
Here is the code...


import urllib
import sys
import os.path
import sgmllib


print "\n\n\t\tlipun4u[at]gmail[dot]com"
print "\t\t------------------------"

appname = os.path.basename(sys.argv[0])

class MyParser(sgmllib.SGMLParser):
"A simple parser class."

def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()

def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."

sgmllib.SGMLParser.__init__(self, verbose)
self.hyperlinks = []

def start_a(self, attributes):
"Process a hyperlink and its 'attributes'."

for name, value in attributes:
if name == "href":
self.hyperlinks.append(value)

def get_hyperlinks(self):
"Return the list of hyperlinks."

return self.hyperlinks



if len(sys.argv) not in [2,]:
print "Usage : " + appname + " "
print "e.g. : " + appname + " www.google.com "
sys.exit(1)
elif "-h" in sys.argv:
print "Usage : " + appname + " "
print "e.g. : " + appname + " www.google.com "
sys.exit(1)
elif "--help" in sys.argv:
print "Usage : " + appname + " "
print "e.g. : " + appname + " www.google.com "
sys.exit(1)



site = sys.argv[1].replace("http://","")
site = "http://" + site.lower()

print "Target : " + site
try:
site_data = urllib.urlopen(site)
parser = MyParser()
parser.parse(site_data.read())
except(IOError),msg:
print "Error in connecting site ", site
print msg
sys.exit(1)
links = parser.get_hyperlinks()
print "Total no. of hyperlinks : " + str(len(links))
print ""
for l in links:
print l


Here is the usage message, followed by a sample run:

 

I:\Python26>linkscan1.py


lipun4u[at]gmail[dot]com
------------------------
Usage : linkscan1.py
e.g. : linkscan1.py www.google.com

I:\Python26>linkscan1.py www.iter.ac.in


lipun4u[at]gmail[dot]com
------------------------
Target : http://www.iter.ac.in
Total no. of hyperlinks : 12

http://iter.ac.in
default.asp
contactus.asp
http://iter.ac.in:8383
time-table.xls
http://www.soauniversity.ac.in/saat_2009.htm
images/advertisement_Saat2009.gif
#
#
#
#
http://www.allindiaonline.in/

I:\Python26>


But some guys added some spice to it and look what they made...


No comments:

Post a Comment