~deadjakk/censys-scraper

53d67d60e52b61417fecce01dc3c3f152e1c444e — deadjakk 1 year, 5 months ago
init
2 files changed, 159 insertions(+), 0 deletions(-)

A README.md
A censys_scrape.py
A  => README.md +25 -0
@@ 1,25 @@
# Censys.io Subdomain Scraper
I haven't yet seen a scraper for censys.io that doesn't require an API key, so I made one.  
Of course, if you would like more than the first 1000 results, you should pay for an API key.  
Uses the pyppeteer library to automate scraping https://censys.io in order to discover subdomains.  
Waits out rate limiting when it is detected. Output is written to scraper-output.txt, nothing fancy.  

Sample command:
`python3 ./censys_scrape.py --username USERNAME --password PASSWORD --domain censys.io --page 40`
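
The script writes one hostname per line, so the output can be fed straight into other tooling; for example (a generic sketch, not part of the script):
```python
# Generic sketch: load the scraper output as a sorted, de-duplicated list
with open("scraper-output.txt") as fh:
    subdomains = sorted({line.strip() for line in fh if line.strip()})

for name in subdomains:
    print(name)
```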

Help output:
```
python3 ./censys_scrape.py  --help
usage: censys_scrape.py [-h] [--page PAGE] --domain DOMAIN --username USERNAME --password PASSWORD [--filename FILENAME]

optional arguments:
  -h, --help           show this help message and exit
  --page PAGE          (optional) number of pages to scrape; otherwise this will be parsed from the results page
  --domain DOMAIN      domain for which to find subdomains
  --username USERNAME  username to login to censys
  --password PASSWORD  password to login to censys
  --filename FILENAME  (optional) name of output file, defaults to scraper-output.txt
```

Bugs:
- Regex pattern is sub-par, will fix later; a stricter candidate is sketched below.
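
A stricter pattern (an untested sketch, not what the script currently uses) would escape the target domain and require at least one leading label:
```python
import re

def subdomain_pattern(domain):
    # Hypothetical helper: one or more dot-separated DNS labels followed by
    # the escaped target domain, e.g. "scan.lb.censys.io" for "censys.io".
    # Note: it will not match the bare domain itself, only subdomains.
    return re.compile(
        r"(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+" + re.escape(domain)
    )

# e.g. subdomain_pattern("censys.io").findall(page_html)
```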

A  => censys_scrape.py +134 -0
@@ 1,134 @@
import re
import sys
import asyncio
import argparse
from pyppeteer import launch

LOGIN_URL = "https://censys.io/login"
SEARCH_URL = "https://censys.io/certificates?q="
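# Subdomains are scraped from the raw HTML of the certificate search results with a regex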

USERNAME_SEL = "#login"
PASSWORD_SEL = "#password"
NEXT_BTN_SEL = "#content > div:nth-child(2) > div:nth-child(2) > div > div > div > div > form > div:nth-child(5) > button"
LOGIN_BTN_SEL = "#content > div:nth-child(2) > div:nth-child(2) > div > div > div > div > form > div:nth-child(9) > button"
PAGE_SEL = "#resultset > div.SearchResultSectionHeader--hack-in-bottom-margin > div > div > span:nth-child(1)"
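# NOTE: the CSS selectors above are tied to censys.io's current markup and will break if the site layout changes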

async def typeout(page,sel,inp):
    await page.waitForSelector(sel)
    await page.type(sel,inp,delay=5)

async def clickit(page,sel):
    try:
        await page.waitForSelector(sel)
        await page.click(sel)
    except Exception:
        # Element wasn't ready; wait briefly and retry once
        print("error, trying again for element")
        await asyncio.sleep(1.0)
        await page.waitForSelector(sel)
        await page.click(sel)

async def login(page,url,username,password):
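    # Censys uses a two-step login form: submit the username first, then the password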
    await page.goto(url)
    await page.waitForSelector(USERNAME_SEL) # Waiting for it to render
    await typeout(page,USERNAME_SEL,username)
    await clickit(page,NEXT_BTN_SEL)
    await asyncio.sleep(1.0)
    await page.waitForSelector(PASSWORD_SEL) 
    await typeout(page,PASSWORD_SEL,password)
    await clickit(page,LOGIN_BTN_SEL)
    await asyncio.sleep(1.0)

async def parse_page_num(page):
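    # The results header reads "<current>/<total>" pages; take the number after the '/'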
    await page.waitForSelector(PAGE_SEL) 
    element = await page.querySelector(PAGE_SEL)
    text_content = await element.getProperty('textContent')
    text_content = await text_content.jsonValue()
    if not text_content:
        print("couldn't determine page count, try adding this manually with the --page param")
        sys.exit(1)
    page_num = int(text_content.strip().split("/")[-1].replace(",",''))
    return page_num

async def scrape_page(page,num,domain,arr):
    # Still loose (any run of word chars/dots before the domain), but the domain's
    # dots are escaped and the raw string avoids invalid-escape warnings
    pattern = r"[\w.]*" + re.escape(domain)
    url = SEARCH_URL + domain + "&page={}".format(num)
    print("-Page {}".format(num))
    await page.goto(url)
    await asyncio.sleep(1)

    all_text = await page.evaluate('document.documentElement.outerHTML')
    while "Error 429" in all_text:
        print("Rate limiting detected... waiting. checking in 45s intervals")
        await asyncio.sleep(45)
        await page.goto(url)
        all_text = await page.evaluate('document.documentElement.outerHTML')
    finds = re.findall(pattern,all_text)
    # Only keep finds we haven't seen before so the caller can report new discoveries
    new_finds = []
    for f in finds:
        if f not in arr:
            arr.append(f)
            new_finds.append(f)
    return new_finds

async def main(args):
    bucket = []
    username = args.username
    password = args.password
    page_num = int(args.page) if args.page else 0 # --page is optional
    domain = args.domain
    filename = 'scraper-output.txt'
    if args.filename:
        filename = args.filename

    if page_num:
        if page_num > 40:
            page_num = 40
            print("Set number of pages to 40, censys limit")

    browser = await launch(args=['--no-sandbox'],headless=True) # Set this to False if you wish to see the browser
    page = await browser.newPage()

    await login(page,LOGIN_URL,username,password)

    # Get number of pages for this query
    await page.goto(SEARCH_URL + domain)
    if not page_num:
        page_num = min(await parse_page_num(page),40) # censys only serves the first 40 pages
    print("Scraping {} pages".format(page_num))

    for n in range(1,(page_num+1)):
        await asyncio.sleep(1.0)
        try:
            # scrape_page appends into bucket and returns only the new finds
            new = await scrape_page(page,n,domain,bucket)
            for s in new:
                print("Discovered:",s)
        except Exception as e:
            print(e)
            input("ERR... waiting for you to hit enter 1/2")
            input("ERR... waiting for you to hit enter 2/2")

    for subdomain in bucket:
        print(subdomain)
    print("Found {} subdomains".format(len(bucket)))
    try:
        with open(filename,'w') as fh:
            fh.write('\n'.join(bucket))
            print("Written to {}".format(filename))
    except Exception as e:
        print("Failed to write to file: {}".format(e))
    print("Finished")

    await browser.close()

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--page",help="(optional)number of pages to scrape, otherwise this will be parsed from page")
    parser.add_argument("--domain",help="domain for which to find subdomains",required=True)
    parser.add_argument("--username",help="username to login to censys",required=True)
    parser.add_argument("--password",help="password to login to censys",required=True)
    parser.add_argument("--filename",help="(optional)name of output file, defaults to scraper-output.txt",required=False)
    args = parser.parse_args()

    asyncio.get_event_loop().run_until_complete(main(args))