comp-4800-web-crawler/main.py

import requests
import sys
import re
from time import sleep
from pprint import pprint


class SiteNode:
    """A crawled page: its URL and the links found on it."""

    # Class-level counter used to assign sequential integer ids to nodes.
    i = 0

    def __init__(self, url, sites):
        self.url = url
        self.sites = sites
        self.id = SiteNode.i
        SiteNode.i += 1

    def __repr__(self):
        return f"<SiteNode id={self.id} url={self.url} sites={self.sites}>"


# Maps each visited URL to its SiteNode; this is the crawl graph.
graph: dict[str, SiteNode] = {}


def parseLinks(html: str) -> list[str]:
    """Extract absolute http(s) URLs from href attributes in the page."""
    anchor_tag_pattern = re.compile(r'href="http[0-9A-Za-z./=:?&_;\-%]+"')
    result = anchor_tag_pattern.findall(html)
    # Strip the leading 'href="' and the trailing '"' from each match.
    return [h[6:-1] for h in result]


def crawl(url: str) -> bool:
    """Fetch url, record its outgoing links, and recursively crawl them.

    Each URL is visited at most once; the recursion has no depth limit.
    """
    if graph.get(url) is not None:
        print(f"Site {url} already visited")
        return False
    try:
        resp = requests.get(url)
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return False
    if resp.status_code == 200:
        links: list[str] = parseLinks(resp.text)
        graph[url] = SiteNode(url, links)
        for link in links:
            sleep(1)  # be polite: pause one second between requests
            crawl(link)
        return True
    print(f"URL returned response code {resp.status_code}")
    return False


if __name__ == "__main__":
    argv: list[str] = sys.argv
    if len(argv) != 2:
        print(f"Usage: python {argv[0]} <webpage>")
        sys.exit(1)
    start_url: str = argv[1]
    crawl(start_url)
    pprint(graph)
    # Write the crawl graph to disk as a tab-separated edge list of node ids.
    with open("graph.txt", "w") as f:
        for key in graph:
            for link in graph[key].sites:
                # Only links that were crawled successfully have a node;
                # skip the rest to avoid a KeyError on graph[link].
                if link in graph:
                    f.write(f"{graph[key].id}\t{graph[link].id}\n")
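
Running the script as `python main.py <webpage>` leaves a graph.txt file containing one "source_id<TAB>target_id" edge per line. As a minimal sketch (assuming graph.txt was produced by main.py in the current directory), the edge list can be read back into Python like this:

# Minimal sketch: read graph.txt back as (source_id, target_id) edge tuples.
edges = []
with open("graph.txt") as f:
    for line in f:
        src, dst = line.split("\t")
        edges.append((int(src), int(dst)))
print(f"{len(edges)} edges loaded")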