import re
import sys
import asyncio
import argparse
from pyppeteer import launch  # only launch() is used; avoid the wildcard import
from bs4 import BeautifulSoup
LOGIN_URL = "https://censys.io/login"
SEARCH_URL = "https://censys.io/certificates?q="
USERNAME_SEL = "#login"
PASSWORD_SEL = "#password"
NEXT_BTN_SEL = "#content > div:nth-child(2) > div:nth-child(2) > div > div > div > div > form > div:nth-child(5) > button"
LOGIN_BTN_SEL = "#content > div:nth-child(2) > div:nth-child(2) > div > div > div > div > form > div:nth-child(9) > button"
PAGE_SEL = "#resultset > div.SearchResultSectionHeader--hack-in-bottom-margin > div > div > span:nth-child(1)"
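# NOTE: the CSS selectors above are tied to the current Censys login/search page markup
# and will likely need updating if the site layout changes.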
async def typeout(page, sel, inp):
    """Type `inp` into the element matched by `sel` once it appears."""
    await page.waitForSelector(sel)
    await page.type(sel, inp, delay=5)
async def clickit(page, sel):
    """Click the element matched by `sel`, swallowing failures so a missing element doesn't abort the run."""
    try:
        await page.waitForSelector(sel)
        await page.click(sel)
    except Exception:
        print("error: could not find or click element {}".format(sel))
async def login(page, url, username, password):
    """Walk through Censys's two-step login form: username first, then password."""
    await page.goto(url)
    await page.waitForSelector(USERNAME_SEL)  # wait for the form to render
    await typeout(page, USERNAME_SEL, username)
    await clickit(page, NEXT_BTN_SEL)
    await asyncio.sleep(1.0)
    await page.waitForSelector(PASSWORD_SEL)
    await typeout(page, PASSWORD_SEL, password)
    await clickit(page, LOGIN_BTN_SEL)
    await asyncio.sleep(1.0)
async def parse_page_num(page):
    """Read the total page count from the search results header (text like "current/total")."""
    await page.waitForSelector(PAGE_SEL)
    element = await page.querySelector(PAGE_SEL)
    text_content = await element.getProperty('textContent')
    text_content = await text_content.jsonValue()
    if not text_content:
        print("couldn't determine page count, try adding this manually with the --page param")
        sys.exit(1)
    # Take the total after the slash and strip thousands separators.
    page_num = int(text_content.strip().split("/")[-1].replace(",", ""))
    return page_num
async def scrape_page(page, num, domain, arr):
    """Fetch one page of results and collect hostnames ending in `domain`.

    New matches are appended to `arr` and also returned so the caller can report them.
    """
    # Crude but serviceable: any run of word characters/dots followed by the (escaped) domain.
    pattern = r"[\w.]*" + re.escape(domain)
    url = SEARCH_URL + domain + "&page={}".format(num)
    print("-Page {}".format(num))
    await page.goto(url)
    await asyncio.sleep(1)
    all_text = await page.evaluate('document.documentElement.outerHTML')
    while "Error 429" in all_text:
        print("Rate limiting detected... waiting. checking in 45s intervals")
        await asyncio.sleep(45)
        await page.goto(url)
        all_text = await page.evaluate('document.documentElement.outerHTML')
    finds = re.findall(pattern, all_text)
    new_finds = []
    for f in finds:
        if f not in arr:
            arr.append(f)
            new_finds.append(f)
    return new_finds
async def main(args):
    bucket = []
    username = args.username
    password = args.password
    page_num = int(args.page) if args.page else None
    domain = args.domain
    filename = 'scraper-output.txt'
    if args.filename:
        filename = args.filename
    if page_num:
        if page_num > 40:
            page_num = 40
            print("Set number of pages to 40, censys limit")
    browser = await launch(args=['--no-sandbox'], headless=True)  # set headless=False to watch the browser
    page = await browser.newPage()
    await login(page, LOGIN_URL, username, password)
    # Get the number of result pages for this domain if it wasn't supplied
    await page.goto(SEARCH_URL + domain)
    if not page_num:
        page_num = await parse_page_num(page)
    print("Scraping {} pages".format(page_num))
    for n in range(1, page_num + 1):
        await asyncio.sleep(1.0)
        try:
            new_finds = await scrape_page(page, n, domain, bucket)
            for s in new_finds:
                print("Discovered:", s)
        except Exception as e:
            print(e)
            input("ERR... waiting for you to hit enter 1/2")
            input("ERR... waiting for you to hit enter 2/2")
    for subdomain in bucket:
        print(subdomain)
    print("Found {} subdomains".format(len(bucket)))
    try:
        with open(filename, 'w') as fh:
            fh.write('\n'.join(bucket))
        print("Written to {}".format(filename))
    except Exception as e:
        print("Failed to write to file: {}".format(e))
    print("Finished")
    await browser.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--page", help="(optional) number of pages to scrape; otherwise this is parsed from the results page", required=False)
    parser.add_argument("--domain", help="domain for which to find subdomains", required=True)
    parser.add_argument("--username", help="username to log in to censys", required=True)
    parser.add_argument("--password", help="password to log in to censys", required=True)
    parser.add_argument("--filename", help="(optional) name of output file, defaults to scraper-output.txt", required=False)
    args = parser.parse_args()
    asyncio.get_event_loop().run_until_complete(main(args))
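# Example invocation (script filename and credentials are placeholders):
#   python censys_scraper.py --domain example.com --username you@example.com --password 'secret'
#   python censys_scraper.py --domain example.com --username you@example.com --password 'secret' --page 10 --filename example-subs.txt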