Linkchecker.py
This is an example of a recursive link checker.
python linkchecker.py www.laptop.org
- stays inside laptop.org (external links are reported, not followed)
- status:
- OK .. fetched fine
- 404 .. not found (or the fetch failed)
- SRY .. https link, skipped (only plain http is followed)
- OUT .. link points outside the base domain, not followed
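
Run against a hypothetical site, a session looks like this (hostnames and paths are invented for illustration; the format follows the script's "depth - status : url" header):

linkchecker 0.1
depth - status : url
0 - OK: http://www.example.org
1 - OK: http://www.example.org/about
2 - 404: http://www.example.org/about/missing
1 - SRY: https://secure.example.org/login
1 - OUT: http://www.google.com/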
linkchecker.py
#! /usr/bin/env python
import sys
import urllib2

history = []    # URLs already visited

def get_content(get_url):
    # Fetch a URL; return the page body, or False on failure.
    f = False
    try:
        f = urllib2.urlopen(get_url).read()
    except urllib2.URLError:    # also covers HTTPError, its subclass
        pass
    return f

def check_content(url, level=0):
    spaces = level    # depth number printed in front of each status line
    # 1. Do we like the URL?
    if len(url) == 0:
        return 1
    if url[:6] == 'mailto':
        return 1
    if url[0] == '/':    # absolute path on the base domain
        url = "http://" + base_domain + url
    elif url[:8] == 'https://':    # https links are not followed
        print spaces, "- SRY:", url
        return 1
    elif url[:7] == 'http://':
        if url[:80].count(base_domain_nowww) == 0:
            print spaces, "- OUT:", url    # leaves the base domain
            return 1
    else:    # relative link; anchor it to the base domain
        url = "http://" + base_domain + "/" + url
    # 2. Yes, we do; reformat
    url = "http://" + url.replace('http://', "").strip()
    # 3. No duplicates
    if history.count(url) > 0:
        return 1
    history.append(url)
    # 4. Fetch content
    c = get_content(url)
    # 5. 404?
    if c == False:
        print spaces, "- 404:", url
        return 1
    print spaces, "- OK:", url
    arr = c.split(">")
    # 6. Search for <a tags
    for a in arr:
        if a.count('<a ') > 0:
            if a.count('href') == 0:
                continue    # anchor without href; keep scanning this page
            # Extract the link into string s
            s = a[a.index('href')+5:].strip()
            if len(s) == 0:
                continue
            if s[0] == '"':
                s = s[1:]
                s = s[:s.index('"')]
            elif s[0] == "'":
                s = s[1:]
                s = s[:s.index("'")]
            # Recurse
            check_content(s, level+1)

def main():
    global base_domain
    global base_domain_nowww
    print "linkchecker 0.1"
    if len(sys.argv) != 2:
        print "- please supply url as argument"
        print "- like: " + sys.argv[0] + " www.google.com"
        sys.exit(0)
    print "depth - status : url"
    base_domain = sys.argv[1].replace("http://", "")
    if base_domain[-1] == '/':
        base_domain = base_domain[:-1]
    base_domain_nowww = base_domain.replace('www.', "")
    # Pass a full URL so step 1 recognizes it as an on-domain http link
    check_content("http://" + base_domain)

if __name__ == "__main__":
    main()
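
The string scan in step 6 only handles quoted href values and can trip over anything unusual inside a tag. For comparison, a minimal sketch (not part of the original script) that does the same extraction with Python 2's bundled HTMLParser:

#! /usr/bin/env python
# Sketch: extract href values with HTMLParser instead of split(">").
from HTMLParser import HTMLParser

class LinkExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) pairs
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

parser = LinkExtractor()
parser.feed('<p><a href="/wiki">wiki</a> <a href=plain>x</a></p>')
print parser.links    # ['/wiki', 'plain']

Feeding each fetched page through a parser like this, then recursing over parser.links, would replace the split(">") loop without changing the rest of the script.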