~deadjakk/censys-scraper

ref: 7efdee447e5dfdb8278b95e292dc063e56bc94f5 censys-scraper/censys_scrape.py -rwxr-xr-x 4.7 KiB
7efdee44 — deadjakk fixed arg requirement 1 year, 6 months ago
                                                                                
import re
import sys
import asyncio
import argparse
from pyppeteer import launch
from bs4 import BeautifulSoup
urls_to_check = []
args = []

LOGIN_URL = "https://censys.io/login"
SEARCH_URL = "https://censys.io/certificates?q="

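# CSS selectors for the censys.io login form and the result-count header;
# these are tied to the current page layout and will break if the site markup changes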
USERNAME_SEL = "#login"
PASSWORD_SEL = "#password"
NEXT_BTN_SEL = "#content > div:nth-child(2) > div:nth-child(2) > div > div > div > div > form > div:nth-child(5) > button"
LOGIN_BTN_SEL = "#content > div:nth-child(2) > div:nth-child(2) > div > div > div > div > form > div:nth-child(9) > button"
PAGE_SEL = "#resultset > div.SearchResultSectionHeader--hack-in-bottom-margin > div > div > span:nth-child(1)"

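# wait for an element to render, then type into it with a small per-key delay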
async def typeout(page,sel,inp):
    await page.waitForSelector(sel)
    await page.type(sel,inp,delay=5)

async def clickit(page,sel):
    # wait for the element and click it; swallow failures so one missing
    # button doesn't kill the whole run
    try:
        await page.waitForSelector(sel)
        await page.click(sel)
    except Exception:
        print("error clicking element {}, continuing".format(sel))

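# walk the two-step censys.io login flow: username, "next", password, then the login button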
async def login(page,url,username,password):
    await page.goto(url)
    await page.waitForSelector(USERNAME_SEL) # Waiting for it to render
    await typeout(page,USERNAME_SEL,username)
    await clickit(page,NEXT_BTN_SEL)
    await asyncio.sleep(1.0)
    await page.waitForSelector(PASSWORD_SEL) 
    await typeout(page,PASSWORD_SEL,password)
    await clickit(page,LOGIN_BTN_SEL)
    await asyncio.sleep(1.0)

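# read the page indicator from the results header and return the total page count
# (the number after the "/")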
async def parse_page_num(page):
    await page.waitForSelector(PAGE_SEL) 
    element = await page.querySelector(PAGE_SEL)
    text_content = await element.getProperty('textContent')
    text_content = await text_content.jsonValue()
    if not text_content:
        print("couldn't determine page count, try adding this manually with the --page param")
        sys.exit(1)
    page_num = int(text_content.strip().split("/")[-1].replace(",",''))
    return page_num

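# load one results page, wait out any 429 rate-limit pages, and return regex
# matches for the domain that are not already in arr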
async def scrape_page(page,num,domain,arr):
    #pattern = "([^\s]*\.)*{}".format(domain)
    # this is trash but midnight regex makes me sad
    pattern = "[\w\d\.]*{}".format(domain)
    url = SEARCH_URL + domain + "&page={}".format(num)
    print("-Page {}".format(num))
    await page.goto(url)
    await asyncio.sleep(1)

    all_text = await page.evaluate('document.documentElement.outerHTML')
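    # censys serves an "Error 429" page when rate limiting kicks in; back off and reload until it clears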
    while "Error 429" in all_text:
        print("Rate limiting detected... waiting. checking in 45s intervals")
        await asyncio.sleep(45)
        await page.goto(url)
        all_text = await page.evaluate('document.documentElement.outerHTML')
    finds = re.findall(pattern,all_text)
    # return only matches the caller hasn't collected yet; main() prints and stores them
    return [f for f in finds if f not in arr]

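# drive the whole run: log in, work out how many result pages exist, scrape each one,
# then dump the collected subdomains to a file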
async def main(args):
    bucket = []
    username = args.username
    password = args.password
    page_num = int(args.page) if args.page else None
    domain = args.domain
    filename = 'scraper-output.txt'
    if args.filename:
        filename = args.filename

    if page_num:
        if page_num > 40:
            page_num = 40
            print("Set number of pages to 40, censys limit")

    browser = await launch(args=['--no-sandbox'],headless=True) # set headless=False if you wish to see the browser
    page = await browser.newPage()

    await login(page,LOGIN_URL,username,password)

    # Get number of pages
    await page.goto(SEARCH_URL + domain)
    if not page_num:
        page_num = await parse_page_num(page)
    print("Scraping {} pages".format(page_num))

    for n in range(1,(page_num+1)):
        await asyncio.sleep(1.0)
        try:
            stuff = await scrape_page(page,n,domain,bucket)
            if stuff:
                for s in stuff:
                    if s not in bucket:
                        print ("Discovered:",s)
                        bucket.append(s)
        except Exception as e:
            print(e)
            input("ERR... waiting for you to hit enter 1/2")
            input("ERR... waiting for you to hit enter 2/2")

    for subdomain in bucket:
        print(subdomain)
    print("Found {} subdomains".format(len(bucket)))
    try:
        with open(filename,'w') as fh:
            fh.write('\n'.join(bucket))
            print("Written to {}".format(filename))
    except Exception as e:
        print("Failed to write to file: {}".format(e))
    print("Finished")

    await browser.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--page",help="(optional)number of pages to scrape, otherwise this will be parsed from page", required=True)
    parser.add_argument("--domain",help="domain for which to find subdomains",required=True)
    parser.add_argument("--username",help="username to login to censys",required=True)
    parser.add_argument("--password",help="password to login to censys",required=True)
    parser.add_argument("--filename",help="(optional)name of output file, defaults to scraper-output.txt",required=False)
    args = parser.parse_args()

    asyncio.get_event_loop().run_until_complete(main(args))
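
# example invocation (placeholder domain and credentials):
#   python3 censys_scrape.py --domain example.com --username you@example.com --password 'hunter2'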