Run the main file with a starting URL; the program recursively parses the links on that page and keeps crawling each discovered URL. The resulting link graph is written to a text file.
This commit is contained in:
72
main.py
Normal file
72
main.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
import requests
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
from time import sleep
|
||||||
|
from pprint import pprint
|
||||||
|
|
||||||
|
|
||||||
|
class SiteNode:
    """One node of the crawl graph: a URL together with the link
    targets that were found on its page."""

    # Class-wide counter; each instance consumes one value as its id.
    i = 0

    def __init__(self, url, sites):
        # url:   address of the crawled page
        # sites: list of outgoing link URLs parsed from the page
        self.url, self.sites = url, sites
        # Take the current counter value as this node's id, then advance it.
        self.id, SiteNode.i = SiteNode.i, SiteNode.i + 1

    def __repr__(self):
        return "<SiteNode id={} url={} sites={}>".format(
            self.id, self.url, self.sites
        )
|
||||||
|
|
||||||
|
# Global registry of successfully crawled pages: maps URL -> its SiteNode.
# Doubles as the "already visited" set that stops the recursion in crawl().
graph: dict[str, SiteNode] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def parseLinks(html) -> list[str]:
    """Return every absolute http(s) URL that appears as an href="..."
    attribute in the given HTML text.

    Only URLs made of the character set below are matched; anything
    else (relative links, unusual characters) is ignored.
    """
    # The capturing group yields just the URL, so findall() returns the
    # bare link targets directly — no slicing needed.
    link_pattern = re.compile(r'href="(http[0-9A-Za-z\.\/=:\?\&_;\-%]+)"')
    return link_pattern.findall(html)
|
||||||
|
|
||||||
|
|
||||||
|
def crawl(url) -> bool:
    """Fetch *url*, record its outgoing links in the global ``graph``,
    and recursively crawl each discovered link.

    Returns True if the page was fetched and recorded, False if it was
    already visited, the request failed, or the server returned a
    non-200 status.
    """
    if url in graph:
        print(f'Site {url} already visited')
        return False

    try:
        # Fix: a timeout keeps one unresponsive host from hanging the
        # whole crawl; without it requests.get can block indefinitely.
        resp = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        # Fix: connection/timeout errors previously crashed the entire
        # recursion; treat a failed fetch like any other failed page.
        print(f'Request to {url} failed: {exc}')
        return False

    if resp.status_code != 200:
        print(f'URL returned response code {resp.status_code}')
        return False

    links: list[str] = parseLinks(resp.text)
    # Register the node BEFORE recursing so cyclic links terminate.
    graph[url] = SiteNode(url, links)

    for link in links:
        sleep(1)  # be polite: throttle one request per second
        crawl(link)
    return True
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    argv: list[str] = sys.argv
    argc: int = len(argv)

    if argc != 2:
        print(f"Usage: python {argv[0]} <webpage>")
        # Fix: previously execution fell through after printing usage,
        # crashing with IndexError on argv[1].
        sys.exit(1)

    start_url: str = argv[1]

    crawl(start_url)
    pprint(graph)

    # Dump the crawl graph as tab-separated edges: "<src_id>\t<dst_id>".
    # The with-statement guarantees the file is closed even on error.
    with open('graph.txt', 'w') as f:
        for key in graph.keys():
            for link in graph[key].sites:
                # Fix: links whose fetch failed never entered `graph`;
                # skip them instead of raising KeyError.
                if link in graph:
                    f.write(f"{graph[key].id}\t{graph[link].id}\n")
|
||||||
Reference in New Issue
Block a user