Linkchecker.py
This is an example of a recursive link checker.
python linkchecker.py www.laptop.org
- stays inside laptop.org (to restrict the crawl to www.laptop.org instead, change 'base_domain_nowww' to 'base_domain' in the domain check inside check_content)
- status messages:
  - OK ... everything is fine
  - OUT .. link points outside the domain
  - 404 .. page not found
  - SRY .. https link, skipped
- output format: lv - status : url (an illustrative sample follows this list)
  - lv = level of recursion (depth)
  - status = one of the status messages above
  - url = the URL that was checked
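A hypothetical run against www.laptop.org might print lines like these (illustrative values only, not captured output):

 0 - OK: http://www.laptop.org
 1 - SRY: https://dev.laptop.org/
 1 - OUT: http://www.google.com/
 1 - 404: http://www.laptop.org/some-missing-page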
linkchecker.py
#!/usr/bin/env python
# linkchecker.py -- recursive link checker (Python 2: urllib2, print statements)

import sys
import urllib2

history = []   # URLs that have already been checked


def get_content(get_url):
    # Return the page body, or False if the server answers with an HTTP error.
    f = False
    try:
        f = urllib2.urlopen(get_url).read()
    except urllib2.HTTPError:
        pass
    return f


def check_content(url, level=0):
    spaces = level   # recursion depth shown in the output

    # 1. Do we like the URL?
    if len(url) == 0:
        return 1
    if url[:6] == 'mailto':
        return 1

    if url[0] == '/':
        url = "http://" + base_domain + url
    elif url[:8] == 'https://':
        print spaces, "- SRY:", url
        return 1
    elif url[:7] == 'http://':
        if url[:80].count(base_domain_nowww) == 0:
            print spaces, "- OUT:", url
            return 1
    else:
        url = "http://" + base_domain + "/" + url

    # 2. Yes, we do; reformat
    url = "http://" + url.replace('http://', "").strip()

    # 3. No duplicates
    if history.count(url) > 0:
        return 1
    history.append(url)

    # 4. Fetch content
    c = get_content(url)

    # 5. 404?
    if c == False:
        print spaces, "- 404:", url
        return 1

    print spaces, "- OK:", url
    arr = c.split(">")

    # 6. Search for <a> tags
    for a in arr:
        if a.count('<a ') > 0:
            if a.count('href') == 0:
                continue   # anchor without href (e.g. <a name=...>)
            s = a[a.index('href') + 5:].strip()

            # Extract the link target into s
            if s[0] == '"':
                s = s[1:]
                s = s[:s.index('"')]
            elif s[0] == "'":
                s = s[1:]
                s = s[:s.index("'")]

            # Recurse
            check_content(s, level + 1)


def main():
    global base_domain
    global base_domain_nowww

    print "linkchecker 0.1"

    if len(sys.argv) != 2:
        print "- please supply url as argument"
        print "- like: " + sys.argv[0] + " www.google.com"
        sys.exit(0)

    print "depth - status : url"

    base_domain = sys.argv[1].replace("http://", "")
    if base_domain[-1] == '/':
        base_domain = base_domain[:-1]
    base_domain_nowww = base_domain.replace('www.', "")

    # Start with a full URL so the start page is fetched as-is
    check_content("http://" + base_domain)


if __name__ == "__main__":
    main()
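The script above is Python 2 (urllib2 and print statements). As a minimal sketch only, the fetch-with-error-handling step could be written for Python 3 with urllib.request; this get_content mirrors the one above and is not part of the original script:

import urllib.request
import urllib.error

def get_content(get_url):
    # Return the page body as text, or False on an HTTP error,
    # mirroring the Python 2 get_content() above.
    try:
        return urllib.request.urlopen(get_url).read().decode('utf-8', 'replace')
    except urllib.error.HTTPError:
        return False

Only the fetch step is sketched here; the rest of the script would also need its print statements converted to print() calls to run on Python 3.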