Linkchecker.py

From OLPC
Jump to: navigation, search

This is an example of a recursive link checker.

python linkchecker.py www.laptop.org
  • stays inside laptop.org (if you want to stay inside www.laptop.org, change 'base_domain_nowww' to 'base_domain' in the two domain checks inside check_content)
  • status messages:
    • OK ... all fine
    • OUT .. link out of domain
    • 404 .. page could not be fetched (reported for any HTTP error, not only status 404)
    • SRY .. https link (skipped, not checked)
  • display (lv - status : url)
    • lv = level of recursion (depth)
    • status
    • url


linkchecker.py

#! /usr/bin/env python
# script-code started by crazy-chris

import sys
import urllib2

# URLs that check_content has already visited; consulted before every fetch
# so that the same page is never requested twice.
history = list()

def get_content(get_url):
	"""Fetch ``get_url`` and return the response body.

	Returns the body as a string on success, or ``False`` when the page
	could not be retrieved. Callers test the result against ``False``, so
	an empty page (``""``) still counts as a successful fetch.
	"""
	try:
		response = urllib2.urlopen(get_url)
		try:
			return response.read()
		finally:
			# Always release the connection, even if read() fails.
			response.close()
	except IOError:
		# urllib2.URLError (and therefore HTTPError) derives from IOError.
		# Catching the base class means unreachable hosts and refused
		# connections are reported as a failed link instead of aborting
		# the whole crawl with a traceback.
		return False

def _extract_href(tag):
	"""Return the href value of an opening ``<a ...`` tag fragment.

	``tag`` is the text starting at ``<a `` and running up to (but not
	including) the closing ``>``. Returns ``None`` when the fragment has no
	``href=`` attribute, when the value is empty, or when a quoted value is
	not terminated inside the fragment.
	"""
	# Find the first 'href' that is actually followed by '=' so that
	# attributes such as 'hreflang' are not mistaken for the link itself.
	pos = tag.find('href')
	while pos >= 0:
		rest = tag[pos + 4:].lstrip()
		if rest.startswith('='):
			break
		pos = tag.find('href', pos + 4)
	else:
		return None

	value = rest[1:].strip()
	if not value:
		return None

	quote = value[0]
	if quote in ('"', "'"):
		end = value.find(quote, 1)
		if end < 0:
			return None
		return value[1:end]

	# Unquoted value: it ends at the first whitespace, anything after that
	# belongs to the next attribute.
	return value.split()[0]


def check_content(url, level = 0):
	"""Check ``url`` and recursively check every link found on that page.

	Prints one line per examined URL in the form ``depth - status: url``
	where status is OK, OUT (outside the base domain), 404 (could not be
	fetched) or SRY (https, not checked).

	url   -- absolute or site-relative link as found in the page.
	level -- recursion depth, only used as the first column of the output.

	Relies on the module globals ``base_domain`` and ``base_domain_nowww``
	(set by main) and records visited URLs in ``history``.

	Returns 1 when the URL was skipped or could not be fetched, and None
	after a page was fetched and its links were processed.
	"""
	# 1. Do we like the URL?
	if len(url) == 0:
		return 1

	if url.startswith('mailto'):
		return 1

	if url[0] == '/':
		url = "http://" + base_domain + url

	elif url.startswith('https://'):
		print("%s - SRY: %s" % (level, url))
		return 1

	elif url.startswith('http://'):
		# Only the first 80 characters are inspected, so a domain name that
		# merely shows up late in a query string does not count as a match.
		if base_domain_nowww not in url[:80]:
			print("%s - OUT: %s" % (level, url))
			return 1

	elif base_domain_nowww in url[:80]:
		# Scheme-less link that already names our domain; use it as is.
		pass

	else:
		url = "http://" + base_domain + "/" + url

	# 2. Yes, we do; reformat
	url = "http://" + url.replace('http://', "").strip()

	# 3. No duplicates
	if url in history:
		return 1

	history.append(url)

	# 4. Fetch content
	content = get_content(url)

	# 5. 404?  (identity check: an empty page "" is still a successful fetch)
	if content is False:
		print("%s - 404: %s" % (level, url))
		return 1
	print("%s -  OK: %s" % (level, url))

	# 6. Search for <a tags. Splitting on '>' leaves each opening tag at the
	#    end of a fragment, possibly preceded by ordinary page text.
	for fragment in content.split(">"):
		start = fragment.find('<a ')
		if start < 0:
			continue

		link = _extract_href(fragment[start:])
		if link is None:
			# Anchor without a usable href (e.g. <a name="...">): skip just
			# this tag and keep scanning the rest of the page.
			continue

		# Recurse
		check_content(link, level + 1)
	
def main():
	"""Command-line entry point: crawl the site given as the only argument.

	Sets the module globals ``base_domain`` (host as given, without the
	``http://`` prefix and trailing slashes) and ``base_domain_nowww`` (the
	same host without a leading ``www.``), then starts the recursive check
	at the site root. Prints a usage hint and exits when the argument is
	missing or contains no host.
	"""
	global base_domain
	global base_domain_nowww

	print("linkchecker 0.1")

	domain = ""
	if len(sys.argv) == 2:
		domain = sys.argv[1].replace("http://", "").rstrip('/')

	# An argument such as "" or "http://" leaves no host to crawl, so it is
	# treated like a missing argument.
	if not domain:
		print("- please supply url as argument")
		print("- like: " + sys.argv[0] + " www.laptop.org")
		sys.exit(0)

	print("depth - status : url")
	base_domain = domain

	# Strip only a leading "www."; replacing every occurrence would mangle
	# hosts that contain "www." elsewhere in their name.
	if base_domain.startswith('www.'):
		base_domain_nowww = base_domain[len('www.'):]
	else:
		base_domain_nowww = base_domain

	check_content(base_domain)

if __name__ == "__main__":
	main()