comp-4800-web-crawler/main.py

import requests
import sys
import re
from time import sleep
from pprint import pprint


class SiteNode:
    """A crawled page: its URL and the links found on it."""

    # Class-level counter used to assign sequential integer ids to nodes.
    i = 0

    def __init__(self, url, sites):
        self.url = url
        self.sites = sites
        self.id = SiteNode.i
        SiteNode.i += 1

    def __repr__(self):
        return f"<SiteNode id={self.id} url={self.url} sites={self.sites}>"


# Maps each visited URL to its SiteNode; this is the crawl graph.
graph: dict[str, SiteNode] = {}


def parseLinks(html: str) -> list[str]:
    """Extract absolute http(s) URLs from href attributes in the page."""
    anchor_tag_pattern = re.compile(r'href="http[0-9A-Za-z./=:?&_;\-%]+"')
    result = anchor_tag_pattern.findall(html)
    # Strip the leading 'href="' and the trailing '"' from each match.
    return [h[6:-1] for h in result]


def crawl(url: str) -> bool:
    """Fetch url, record its outgoing links, and recursively crawl them.

    Each URL is visited at most once; the recursion has no depth limit.
    """
    if graph.get(url) is not None:
        print(f"Site {url} already visited")
        return False
    try:
        resp = requests.get(url)
    except requests.RequestException as exc:
        print(f"Request to {url} failed: {exc}")
        return False
    if resp.status_code == 200:
        links: list[str] = parseLinks(resp.text)
        graph[url] = SiteNode(url, links)
        for link in links:
            sleep(1)  # be polite: pause one second between requests
            crawl(link)
        return True
    print(f"URL returned response code {resp.status_code}")
    return False


if __name__ == "__main__":
    argv: list[str] = sys.argv
    if len(argv) != 2:
        print(f"Usage: python {argv[0]} <webpage>")
        sys.exit(1)
    start_url: str = argv[1]
    crawl(start_url)
    pprint(graph)
    # Write the crawl graph to disk as a tab-separated edge list of node ids.
    with open("graph.txt", "w") as f:
        for key in graph:
            for link in graph[key].sites:
                # Only links that were crawled successfully have a node;
                # skip the rest to avoid a KeyError on graph[link].
                if link in graph:
                    f.write(f"{graph[key].id}\t{graph[link].id}\n")
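
Running the script as `python main.py <webpage>` leaves a graph.txt file containing one "source_id<TAB>target_id" edge per line. As a minimal sketch (assuming graph.txt was produced by main.py in the current directory), the edge list can be read back into Python like this:

# Minimal sketch: read graph.txt back as (source_id, target_id) edge tuples.
edges = []
with open("graph.txt") as f:
    for line in f:
        src, dst = line.split("\t")
        edges.append((int(src), int(dst)))
print(f"{len(edges)} edges loaded")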