~jheckt/GoofyStuff

29719e6c2d4bdbe54deead4d9bcefe1301f3ebc1 — turnipsoup 1 year, 3 months ago f19213c
fixed bug with missing values causing fails. need to not check endpoints multiple times
1 files changed, 28 insertions(+), 16 deletions(-)

M sleuthing/check_robots.py
M sleuthing/check_robots.py => sleuthing/check_robots.py +28 -16
@@ 2,10 2,16 @@ import requests, sys


def check_status_code(url, endpoint, timeout=10):
    """
    Gets the HTTP status code of the passed URL and endpoint. Returns the code

    url: base URL; endpoint is appended to it as-is (no separator is added).
    endpoint: path to request on that URL.
    timeout: seconds to wait for the server before requests gives up and
        raises a timeout error. Pass None to wait indefinitely.
    """
    # Without a timeout, requests may block forever on an unresponsive host,
    # which would stall the whole scan on a single endpoint.
    response = requests.get(f"{url}{endpoint}", timeout=timeout)
    return response.status_code


def check_endpoint(endpoint):
    """
    Checks if the endpoint is up, prints result to screen
    """
    try:
        print(endpoint, "->", check_status_code(target_url, endpoint))



@@ 14,10 20,28 @@ def check_endpoint(endpoint):


def check_endpoint_list(endpoint_list):
    """
    Checks all endpoints in the passed list

    Each distinct endpoint is checked once, in order of first appearance,
    so a path that is listed several times only triggers one request.
    """
    # dict.fromkeys drops repeated entries while keeping the original order.
    for endpoint in dict.fromkeys(endpoint_list):
        check_endpoint(endpoint)


def clean_list(endpoint_list):
    """
    Cleans the endpoint list and removes the labels. Returns a cleaned list.

    Each entry is expected to look like "Label: value". The result holds one
    item per input entry, in the same order. Entries that have no value (or
    no colon at all) yield "" so the output stays aligned with the input.
    """
    # partition splits on the first colon only, so "Label:value" (no space
    # after the colon) still yields its value and a value that itself
    # contains ": " is kept whole; strip removes the surrounding whitespace.
    return [entry.partition(":")[2].strip() for entry in endpoint_list]


if __name__ == "__main__":
    target_url_robots = sys.argv[1]
    target_url = target_url_robots.split("/robots.txt")[0]


@@ 42,22 66,10 @@ if __name__ == "__main__":
    # Get disallows
    disallows = [x for x in endpoints if "disallow" in x.lower()]

    # Clean fetched items
    try:
        clean_sitemaps = [x.split(": ")[1] for x in sitemaps]
    except IndexError:
        clean_sitemaps = []

    try:
        clean_allows = [x.split(": ")[1] for x in allows]
    except IndexError:
        clean_allows = []


    try:
        clean_disallows = [x.split(": ")[1] for x in disallows]
    except IndexError:
        clean_disallows = []
    # Clean all of the results
    clean_sitemaps = clean_list(sitemaps)
    clean_allows = clean_list(allows)
    clean_disallows = clean_list(disallows)

    # Check the endpoints and print to screen
    print("Endpoint -> Status Code")