Linkchecker.py

From OLPC
Revision as of 16:27, 29 November 2007 by Crazy-chris (talk | contribs) (+OUT)
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

This is an example of a recursive link checker.

python linkchecker.py www.laptop.org
  • stays inside laptop.org
  • status:
    • OK ... all fine
    • OUT .. link points outside the domain
    • 404 .. page not found
    • SRY .. https link (not checked)


linkchecker.py

#! /usr/bin/env python
import sys
import urllib2

history = []

def get_content(get_url):
	"""Fetch the body of *get_url* and return it as a string.

	Returns False when the URL cannot be retrieved, so callers can
	report any fetch failure as a dead link.
	"""
	try:
		return urllib2.urlopen(get_url).read()
	# URLError is the base class of HTTPError and also covers
	# network-level failures (DNS errors, connection refused);
	# the original caught HTTPError only and crashed the whole
	# crawl on an unreachable host.
	except urllib2.URLError:
		return False

def check_content(url, level = 0):
	spaces = level
	
	# 1. Do we like the URL?
	if len(url) == 0: 
		level -= 1
		return 1
	
	if url[:6] == 'mailto':
		level -= 1;
		return 1
		
	if url[0] == '/': 
		url = "http://" + base_domain + url

	elif url[:8] == 'https://':
		print spaces, "- SRY:", url
		level -= 1
		return 1

	elif url[:7] == 'http://':
		if url[:80].count(base_domain_nowww) == 0:
			print spaces, "- OUT:", url
			level -= 1;
			return 1
			
	elif url[:80].count(base_domain_nowww) > 0:
		pass
		
	else:
		url = "http://" + base_domain + "/" + url
		
	# 2. Yes, we do; reformat
	url = "http://" + url.replace('http://', "").strip()

	# 3. No duplicates
	if history.count(url) > 0: 
		level -= 1
		return 1

	history.append(url)	

	# 4. Fetch Content
#	print url
	c = get_content(url)
	
	# 5. 404?
	if c == False: 
		print spaces, "- 404:", url
		level -= 1
		return 1
	print spaces, "-  OK:", url
	
	arr = c.split(">")

	# 6. Search or <a tags 
	for a in arr:
		if a.count('<a ') > 0:
			if a.count('href') == 0:
				level -= 1
				return 1
				
			s = a[a.index('href')+5:].strip()
			# Extract Link to string s
			if s[0] == '"':
				s = s[1:]
				s = s[:s.index('"')]
			elif s[0] == "'":
				s = s[1:]
				s = s[:s.index("'")]

			# Recurse
			check_content(s, level+1)
	level -= 1
	
def main():
	global base_domain
	global base_domain_nowww
	
	print "linkchecker 0.1"
	if len(sys.argv) != 2:
		print "- please supply url as argument"
		print "- like: " + sys.argv[0] + " www.google.com"
		sys.exit(0)
	
	print "depth - status : url"
	base_domain = sys.argv[1].replace("http://", "")
	if base_domain[-1] == '/': base_domain = base_domain[:-1]
	base_domain_nowww = base_domain.replace('www.', "")

	check_content(base_domain)
	
if __name__ == "__main__":
	main()