import re
import sys
from time import sleep
from pprint import pprint

import requests


class SiteNode:
    """A crawled page: its URL, the links found on it, and a sequential id."""

    i = 0  # class-level counter used to hand out unique node ids

    def __init__(self, url: str, sites: list[str]):
        self.url = url
        self.sites = sites
        self.id = SiteNode.i
        SiteNode.i += 1

    def __repr__(self):
        return f"<SiteNode {self.id}: {self.url}>"


# Maps each successfully visited URL to its SiteNode.
graph: dict[str, SiteNode] = {}


def parseLinks(html: str) -> list[str]:
    # Match href="http..." attributes; a raw string keeps the escapes readable.
    anchor_tag_pattern = re.compile(r'href="http[0-9A-Za-z./=:?&_;%-]+"')
    # Strip the leading 'href="' (6 characters) and the trailing quote.
    return [match[6:-1] for match in anchor_tag_pattern.findall(html)]


def crawl(url: str) -> bool:
    if graph.get(url) is not None:
        print(f"Site {url} already visited")
        return False
    try:
        resp = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # A dead link or connection error should not abort the whole crawl.
        print(f"Request to {url} failed: {exc}")
        return False
    if resp.status_code == 200:
        links: list[str] = parseLinks(resp.text)
        graph[url] = SiteNode(url, links)
        for link in links:
            sleep(1)  # be polite: at most one request per second
            # Depth-first recursion; note that a long chain of links can
            # exhaust Python's default recursion limit on large sites.
            crawl(link)
        return True
    print(f"URL returned response code {resp.status_code}")
    return False


if __name__ == "__main__":
    argv: list[str] = sys.argv
    if len(argv) != 2:
        print(f"Usage: python {argv[0]} <start_url>")
        sys.exit(1)
    start_url: str = argv[1]
    crawl(start_url)
    pprint(graph)
    # Persist the crawl as a tab-separated edge list of node ids.
    with open("graph.txt", "w") as f:
        for node in graph.values():
            for link in node.sites:
                # Links that never crawled successfully (non-200 response or
                # request error) have no node; skip them to avoid a KeyError.
                if link in graph:
                    f.write(f"{node.id}\t{graph[link].id}\n")
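
# --- Illustrative addition, not part of the original crawler ---
# graph.txt is written above as a tab-separated edge list of node ids
# ("source\ttarget", one pair per line), so it can be read back into an
# adjacency mapping with the standard library alone. The helper name
# load_edge_list is hypothetical, chosen here for the sketch.
def load_edge_list(path: str = "graph.txt") -> dict[int, list[int]]:
    """Rebuild an adjacency mapping {source_id: [target_ids]} from the file."""
    adjacency: dict[int, list[int]] = {}
    with open(path) as fh:
        for line in fh:
            src, dst = map(int, line.split("\t"))
            adjacency.setdefault(src, []).append(dst)
    return adjacency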