From 4040c521140171a0458123601d46ebcf1526f135 Mon Sep 17 00:00:00 2001
From: Jagraj Aulakh
Date: Sun, 12 Mar 2023 14:25:22 -0400
Subject: [PATCH] User runs the main file with a URL; the program recursively
 parses links on that page and keeps crawling the discovered URLs. The graph
 is written out to a text file.

---
 main.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 main.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..0a30615
--- /dev/null
+++ b/main.py
@@ -0,0 +1,81 @@
+import requests
+import sys
+import re
+from time import sleep
+from pprint import pprint
+
+
+class SiteNode:
+    # Class-level counter used to hand out a unique integer id per node.
+    i = 0
+
+    def __init__(self, url, sites):
+        self.url = url
+        self.sites = sites
+        self.id = SiteNode.i
+        SiteNode.i += 1
+
+    def __repr__(self):
+        return f"<SiteNode id={self.id} url={self.url}>"
+
+
+graph: dict[str, SiteNode] = {}
+
+
+def parseLinks(html) -> list[str]:
+    # Match absolute links only, e.g. href="https://example.com/page".
+    anchor_tag_pattern = re.compile('href="http[0-9A-Za-z\\.\\/=:\\?\\&_;\\-%]+"')
+    result = anchor_tag_pattern.findall(html)
+
+    links = []
+    for h in result:
+        # Strip the leading 'href="' and the trailing '"'.
+        links.append(h[6:-1])
+
+    return links
+
+
+def crawl(url) -> bool:
+    if graph.get(url, None) is not None:
+        print(f'Site {url} already visited')
+        return False
+
+    resp = requests.get(url)
+    if resp.status_code == 200:
+        links: list[str] = parseLinks(resp.text)
+        node: SiteNode = SiteNode(url, links)
+
+        graph[url] = node
+
+        # Depth-first crawl; sleep between requests to rate-limit ourselves.
+        for l in links:
+            sleep(1)
+            crawl(l)
+        return True
+
+    print(f'URL returned response code {resp.status_code}')
+    return False
+
+
+if __name__ == "__main__":
+    argv: list[str] = sys.argv
+    argc: int = len(argv)
+
+    if argc != 2:
+        print(f"Usage: python {argv[0]} <url>")
+        sys.exit(1)
+
+    start_url: str = argv[1]
+
+    crawl(start_url)
+    pprint(graph)
+
+    f = open('graph.txt', 'w')
+    for key in graph.keys():
+        for link in graph[key].sites:
+            # Links that never crawled successfully (non-200 responses)
+            # have no node in the graph, so skip them to avoid a KeyError.
+            if link in graph:
+                f.write(f"{graph[key].id}\t{graph[link].id}\n")
+
+    f.close()
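
Usage note (not part of the patch): with the file saved as main.py, a run looks
roughly like the hypothetical session below. https://example.com stands in for
whatever start URL is passed, and the ids shown in graph.txt are illustrative;
real values depend on crawl order.

    $ python main.py https://example.com
    $ cat graph.txt
    0	1
    0	2
    1	3

Each line of graph.txt is one directed edge, "<source id><TAB><destination id>",
where the ids are the SiteNode.id values assigned in the order pages were first
visited.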
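
Because graph.txt is a plain tab-separated edge list, it can be read back for
later analysis. A minimal sketch, assuming the format above (load_edges is a
hypothetical helper, not part of this patch):

    def load_edges(path: str) -> list[tuple[int, int]]:
        # Parse one "src<TAB>dst" pair per line back into integer edges.
        edges = []
        with open(path) as f:
            for line in f:
                src, dst = line.split("\t")
                edges.append((int(src), int(dst)))
        return edges

    print(load_edges("graph.txt"))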