Linkchecker.py
This is an example of a recursive link checker.
python linkchecker.py www.laptop.org
- stays inside laptop.org (to stay inside www.laptop.org only, change 'base_domain_nowww' to 'base_domain' in the out-of-domain check inside check_content(); see the sketch after this list)
- status messages:
  - OK ... all fine
  - OUT .. link points out of the domain
  - 404 .. not found
  - SRY .. https links are not followed
- display (lv - status : url)
  - lv = level of recursion (depth)
  - status
  - url
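The swap matters because base_domain_nowww matches any host under laptop.org, while base_domain matches only www.laptop.org. A minimal sketch of the difference (wiki.laptop.org is used here only as an illustrative in-domain host):

# Illustrative only: why swapping the variable narrows the crawl.
base_domain = "www.laptop.org"
base_domain_nowww = base_domain.replace('www.', "")   # "laptop.org"

url = "http://wiki.laptop.org/go/Linkchecker.py"

# The published check: any host containing "laptop.org" stays in-domain.
print url[:80].count(base_domain_nowww) > 0   # True  -> followed
# After the swap: only www.laptop.org URLs stay in-domain.
print url[:80].count(base_domain) > 0         # False -> reported as OUT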
linkchecker.py
#! /usr/bin/env python

# script-code started by crazy-chris

import sys
import urllib2

history = []

def get_content(get_url):
    # Fetch a page; return the body, or False if the request fails.
    f = False
    try:
        f = urllib2.urlopen(get_url).read()
    except urllib2.URLError:
        # URLError also covers HTTPError, so unreachable hosts
        # and 404s are both reported as broken links.
        pass
    return f

def check_content(url, level=0):
    # 1. Do we like the URL?
    if len(url) == 0:
        return
    if url[:6] == 'mailto':
        return

    if url[0] == '/':
        # Site-absolute link
        url = "http://" + base_domain + url
    elif url[:8] == 'https://':
        print level, "- SRY:", url
        return
    elif url[:7] == 'http://':
        if url[:80].count(base_domain_nowww) == 0:
            print level, "- OUT:", url
            return
    else:
        # Relative link
        url = "http://" + base_domain + "/" + url

    # 2. Yes, we do; reformat
    url = "http://" + url.replace('http://', "").strip()

    # 3. No duplicates
    if history.count(url) > 0:
        return
    history.append(url)

    # 4. Fetch content
    c = get_content(url)

    # 5. 404?
    if c == False:
        print level, "- 404:", url
        return

    print level, "- OK:", url

    # 6. Search for <a tags
    for a in c.split(">"):
        if a.count('<a ') > 0:
            if a.count('href') == 0:
                continue  # anchor without href; keep scanning this page

            # Extract the link target into s
            s = a[a.index('href')+5:].strip()
            if s[0] == '"':
                s = s[1:]
                s = s[:s.index('"')]
            elif s[0] == "'":
                s = s[1:]
                s = s[:s.index("'")]

            # Recurse
            check_content(s, level+1)

def main():
    global base_domain
    global base_domain_nowww

    print "linkchecker 0.1"

    if len(sys.argv) != 2:
        print "- please supply url as argument"
        print "- like: " + sys.argv[0] + " www.laptop.org"
        sys.exit(0)

    print "depth - status : url"

    base_domain = sys.argv[1].replace("http://", "")
    if base_domain[-1] == '/':
        base_domain = base_domain[:-1]
    base_domain_nowww = base_domain.replace('www.', "")

    # Pass a full URL so the relative-link branch is not taken
    # on the first call.
    check_content("http://" + base_domain)

if __name__ == "__main__":
    main()
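A session might look like the following; the URLs and depths shown are illustrative only, since the real output depends on whatever pages the crawl reaches:

$ python linkchecker.py www.laptop.org
linkchecker 0.1
depth - status : url
0 - OK: http://www.laptop.org
1 - OK: http://www.laptop.org/en/vision/
2 - OUT: http://www.google.com/
2 - SRY: https://www.laptop.org/donate/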