Linkchecker.py

From OLPC
Jump to navigation Jump to search
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.

This is an example of a recursive link checker.

python linkchecker.py www.laptop.org
  • stays inside laptop.org (to stay inside www.laptop.org only, replace both occurrences of 'base_domain_nowww' in the function check_content with 'base_domain')
  • status messages:
    • OK ... all fine
    • OUT .. link out of domain
    • 404 .. not found
    • SRY .. https link (skipped, not checked)
  • display (lv - status : url)
    • lv = level of recursion (depth)
    • status
    • url


linkchecker.py

#! /usr/bin/env python
# script-code started by crazy-chris

import sys
import urllib2

# URLs that check_content() has already accepted for fetching. It is consulted
# before every fetch and appended to afterwards, so each URL is requested once.
history = []

def get_content(get_url):
    """Fetch *get_url* and return the response body.

    Returns the body as a string on success, or ``False`` when the URL
    could not be opened (HTTP error status, unknown host, refused
    connection, ...). Callers compare the result against ``False``, so
    that sentinel is kept instead of raising.
    """
    try:
        response = urllib2.urlopen(get_url)
    except urllib2.URLError:
        # HTTPError is a subclass of URLError, so this covers HTTP error
        # statuses as well as network-level failures that would otherwise
        # abort the whole crawl with a traceback.
        return False
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()

def _extract_href(chunk):
    """Return the href value of the ``<a `` tag in *chunk*, or None.

    *chunk* is one piece of a page that was split on ``>``, so it holds
    at most some leading text followed by one unterminated tag. None is
    returned when there is no ``<a `` tag, no ``href`` attribute, or no
    value after ``href``.
    """
    tag_start = chunk.find('<a ')
    if tag_start < 0:
        return None
    # Only look inside the tag, not at page text preceding it.
    tag = chunk[tag_start:]

    attr_pos = tag.find('href')
    if attr_pos < 0:
        return None

    # Tolerate whitespace around '=' (href = "...").
    value = tag[attr_pos + len('href'):].lstrip()
    if not value.startswith('='):
        return None
    value = value[1:].strip()
    if not value:
        return None

    quote = value[0]
    if quote in ('"', "'"):
        closing = value.find(quote, 1)
        if closing < 0:
            # Unterminated quote: take everything after the opening quote.
            return value[1:]
        return value[1:closing]

    # Unquoted value ends at the first whitespace; the rest is other attributes.
    return value.split()[0]


def check_content(url, level=0):
    """Check *url* and recursively check every link found on its page.

    One status line is printed per examined URL in the form
    ``<level> - <status>: <url>`` where status is ``SRY`` (https link,
    skipped), ``OUT`` (http link outside the base domain, skipped),
    ``404`` (get_content() returned False) or `` OK`` (page fetched).

    Relies on the module-level names ``base_domain``,
    ``base_domain_nowww`` (set by main()) and ``history``.

    url   -- absolute URL, host-relative path or bare relative link.
    level -- recursion depth, used only as the prefix of the status line.

    Returns 1 when the URL was skipped or could not be fetched, and None
    after a fetched page has been scanned for links.
    """
    # 1. Do we like the URL?
    if len(url) == 0 or url[:6] == 'mailto':
        return 1

    if url[0] == '/':
        url = "http://" + base_domain + url
    elif url[:8] == 'https://':
        print("%s - SRY: %s" % (level, url))
        return 1
    elif url[:7] == 'http://':
        if base_domain_nowww not in url[:80]:
            print("%s - OUT: %s" % (level, url))
            return 1
    elif base_domain_nowww not in url[:80]:
        # Bare relative link: it is resolved against the site root.
        url = "http://" + base_domain + "/" + url

    # 2. Yes, we do; reformat so every URL carries exactly one scheme prefix
    url = "http://" + url.replace('http://', "").strip()

    # 3. No duplicates
    if url in history:
        return 1
    history.append(url)

    # 4. Fetch content
    page = get_content(url)

    # 5. Fetch failed? (an empty page is a valid result, hence the identity test)
    if page is False:
        print("%s - 404: %s" % (level, url))
        return 1
    print("%s -  OK: %s" % (level, url))

    # 6. Search for <a tags and recurse into every link
    for chunk in page.split(">"):
        if '<a ' not in chunk:
            continue
        link = _extract_href(chunk)
        if link is None:
            # Anchor without a usable href (e.g. <a name="top">): skip it
            # and keep scanning the remaining links of this page.
            continue
        check_content(link, level + 1)
	
def main():
    """Command-line entry point: check all links below the given domain.

    Expects exactly one argument, the domain to start from (an optional
    leading ``http://`` and trailing slashes are removed). Sets the
    module-level ``base_domain`` and ``base_domain_nowww`` that
    check_content() reads, then starts the recursive check.

    Prints a usage hint and exits with status 2 when the argument is
    missing or empty after clean-up.
    """
    global base_domain
    global base_domain_nowww

    print("linkchecker 0.1")

    domain = ""
    if len(sys.argv) == 2:
        domain = sys.argv[1].replace("http://", "").rstrip("/")

    if not domain:
        # Covers both a wrong argument count and arguments such as
        # "http://" that leave nothing to crawl.
        print("- please supply url as argument")
        print("- like: " + sys.argv[0] + " www.laptop.org")
        sys.exit(2)

    print("depth - status : url")
    base_domain = domain
    base_domain_nowww = base_domain.replace('www.', "")

    check_content(base_domain)
	
# Start the link check only when this file is executed as a script.
if __name__ == "__main__":
	main()