From 4040c521140171a0458123601d46ebcf1526f135 Mon Sep 17 00:00:00 2001
From: Jagraj Aulakh
Date: Sun, 12 Mar 2023 14:25:22 -0400
Subject: [PATCH] User runs the main file with a URL; the program recursively
 parses links on that page and keeps crawling the discovered URLs. The graph
 is written out to a text file.

---
 main.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..0a30615
--- /dev/null
+++ b/main.py
@@ -0,0 +1,81 @@
+import requests
+import sys
+import re
+from time import sleep
+from pprint import pprint
+
+
+class SiteNode:
+    # Class-level counter used to hand out a unique integer id per node.
+    i = 0
+
+    def __init__(self, url, sites):
+        self.url = url
+        self.sites = sites
+        self.id = SiteNode.i
+        SiteNode.i += 1
+
+    def __repr__(self):
+        return f"<SiteNode id={self.id} url={self.url}>"
+
+
+graph: dict[str, SiteNode] = {}
+
+
+def parseLinks(html) -> list[str]:
+    # Match absolute links only, e.g. href="https://example.com/page".
+    anchor_tag_pattern = re.compile('href="http[0-9A-Za-z\\.\\/=:\\?\\&_;\\-%]+"')
+    result = anchor_tag_pattern.findall(html)
+
+    links = []
+    for h in result:
+        # Strip the leading 'href="' and the trailing '"'.
+        links.append(h[6:-1])
+
+    return links
+
+
+def crawl(url) -> bool:
+    if graph.get(url, None) is not None:
+        print(f'Site {url} already visited')
+        return False
+
+    resp = requests.get(url)
+    if resp.status_code == 200:
+        links: list[str] = parseLinks(resp.text)
+        node: SiteNode = SiteNode(url, links)
+
+        graph[url] = node
+
+        # Depth-first crawl; sleep between requests to rate-limit ourselves.
+        for l in links:
+            sleep(1)
+            crawl(l)
+        return True
+
+    print(f'URL returned response code {resp.status_code}')
+    return False
+
+
+if __name__ == "__main__":
+    argv: list[str] = sys.argv
+    argc: int = len(argv)
+
+    if argc != 2:
+        print(f"Usage: python {argv[0]} <url>")
+        sys.exit(1)
+
+    start_url: str = argv[1]
+
+    crawl(start_url)
+    pprint(graph)
+
+    f = open('graph.txt', 'w')
+    for key in graph.keys():
+        for link in graph[key].sites:
+            # Links that never crawled successfully (non-200 responses)
+            # have no node in the graph, so skip them to avoid a KeyError.
+            if link in graph:
+                f.write(f"{graph[key].id}\t{graph[link].id}\n")
+
+    f.close()
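
Usage note (not part of the patch): with the file saved as main.py, a run looks
roughly like the hypothetical session below. https://example.com stands in for
whatever start URL is passed, and the ids shown in graph.txt are illustrative;
real values depend on crawl order.

    $ python main.py https://example.com
    $ cat graph.txt
    0	1
    0	2
    1	3

Each line of graph.txt is one directed edge, "<source id><TAB><destination id>",
where the ids are the SiteNode.id values assigned in the order pages were first
visited.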
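
Because graph.txt is a plain tab-separated edge list, it can be read back for
later analysis. A minimal sketch, assuming the format above (load_edges is a
hypothetical helper, not part of this patch):

    def load_edges(path: str) -> list[tuple[int, int]]:
        # Parse one "src<TAB>dst" pair per line back into integer edges.
        edges = []
        with open(path) as f:
            for line in f:
                src, dst = line.split("\t")
                edges.append((int(src), int(dst)))
        return edges

    print(load_edges("graph.txt"))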