~wrycode/plr

2be2df09862dda4ac062a9645dda478be0245a30 — Nick Econopouly 5 months ago
Refactored everything into plr.py.

- also removed asynchronous link lookup (might reimplement later; one possible shape is sketched after the file list)
3 files changed, 58 insertions(+), 88 deletions(-)

D lib.py
M plr.py
D requirements.txt
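
The dropped parallel lookup lived in lib.py's replaceText(), which fanned getWebArchiveLink() out over a multiprocessing.Pool (visible in the deleted hunk below). If the feature does come back, a thread pool is the lighter fit, since the work is network-bound rather than CPU-bound. A minimal sketch of that shape, reusing the getWebArchiveLink() now defined in plr.py; this is speculative, not part of the commit:

    # Hypothetical sketch, not in this commit: concurrent archive lookups
    # with threads instead of the removed multiprocessing.Pool.
    from concurrent.futures import ThreadPoolExecutor

    def lookup_links(urls):
        unique = list(set(urls))  # fetch each distinct URL only once
        with ThreadPoolExecutor(max_workers=8) as pool:
            archived = pool.map(getWebArchiveLink, unique)
        return dict(zip(unique, archived))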
D lib.py => lib.py +0 -73
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-import sys
-import re
-import json
-from multiprocessing import Pool, cpu_count
-
-import requests
-
-
-def isurl(s):
-    if re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s):
-        return True
-    else:
-        return False
-
-
-def getWebArchiveLink(url):
-  # will need to handle cases that it can't find, like for http://w3techs.com/technologies/overview/javascript_library/all
-  if 'web.archive' in url:
-    return url,url
-  try:
-    r = requests.get('https://web.archive.org/save/' + url)
-    print("Got permanent link for " + url)
-  except:
-    return url,url
-  if r.status_code == 403:
-    return url,url
-  else:
-    try:
-      return url,'https://web.archive.org' + r.headers['content-location']
-    except:
-      print(url)
-      return url,url
-
-
-def replaceText(text_test,apikey):
-  urls = []
-  urls_in_order = []
-  for url in  re.findall(r'(https?://[^\s]+)', text_test):
-    newurl = url.split('"')[0].split('<')[0]
-    while newurl[-1] == '.' or newurl[-1] == ')' or newurl[-1] == '!':
-      newurl = newurl[:-1]
-    if not apikey:
-      urls.append(newurl)
-    else:
-      urls.append((newurl,apikey))
-    urls_in_order.append(newurl)
-
-
-  f = getWebArchiveLink
-  if apikey:
-    f = getPermaccLink
-  p = Pool(cpu_count())
-  conversion = {}
-  for result in p.map(f, list(set(urls))):
-    conversion[result[0]] = result[1]
-  p.terminate()
-
-  print(conversion)
-  curPos = 0
-  for url in urls_in_order:
-    if url in text_test[curPos:]:
-      print(url)
-      print(conversion[url])
-      print(text_test[curPos:])
-      newPos = text_test.index(url)
-      text_test = text_test[0:curPos] + text_test[curPos:].replace(url,conversion[url],1)
-      curPos = newPos
-
-  return text_test
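
Both the deleted replaceText() above and its replacement in plr.py share the same URL clean-up: each https?:// match runs to the next whitespace, is cut at any '"' or '<', and then loses trailing '.', ')' or '!' characters. A quick illustration of that trimming on made-up input, with the three-way comparison condensed into an 'in' test:

    # Illustration of the URL trimming above (example input, not repo data)
    import re

    sample = 'See the spec (https://example.com/doc). More at https://example.com/faq!'
    for url in re.findall(r'(https?://[^\s]+)', sample):
        url = url.split('"')[0].split('<')[0]
        while url[-1] in '.)!':  # same three trailing characters stripped above
            url = url[:-1]
        print(url)
    # prints:
    #   https://example.com/doc
    #   https://example.com/faq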




M plr.py => plr.py +58 -8
@@ -1,11 +1,15 @@
-from lib import *
 # -*- coding: utf-8 -*-
 import sys
 import argparse
+import re
+import json
+import requests
 
 def parseargs():
     parser = argparse.ArgumentParser()
-    parser.add_argument('-if', nargs='?', type=argparse.FileType('r'),
+    parser.add_argument('-i', nargs='?', type=argparse.FileType('r'),
                         default=sys.stdin, action='store', dest='input')
-    parser.add_argument('-of', nargs='?', type=argparse.FileType('w'),
+    parser.add_argument('-o', nargs='?', type=argparse.FileType('w'),
                          default=sys.stdout, action='store', dest='output')
     return parser.parse_args()
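
With the flags shortened from -if/-of to -i/-o, a typical run becomes plr.py -i draft.md -o out.md, falling back to stdin and stdout when a flag is omitted. A quick sanity check of the renamed parser, assuming the parseargs() above (the filenames are hypothetical):

    # Illustrative check of the renamed flags (hypothetical filenames):
    import sys
    sys.argv = ['plr.py', '-i', 'draft.md', '-o', 'out.md']
    args = parseargs()
    print(args.input.name, args.output.name)  # -> draft.md out.md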



@@ -18,8 +22,54 @@ def main():
         writeoutput = print
 
     content = args.input.read()
-    content = replaceText(content,False)
-    writeoutput(content)
-
-if __name__ == '__main__':
-    main()
+    content = replaceLinks(content)
+    #writeoutput(content)
+
+
+def isurl(s):
+    if re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s):
+        return True
+    else:
+        return False
+
+def getWebArchiveLink(url):
+  if 'web.archive' in url:
+    return url
+  try:
+    r = requests.get('https://archive.org/wayback/available?url=' + url)
+    rjson = json.loads(r.content)
+  except:
+    return url
+  if not 'closest' in rjson['archived_snapshots']:
+    return url
+  else:
+    try:
+      return rjson['archived_snapshots']['closest']['url']
+    except:
+      return url
+
+def replaceLinks(text):
+  urls = []
+  for url in  re.findall(r'(https?://[^\s]+)', text):
+    newurl = url.split('"')[0].split('<')[0]
+    while newurl[-1] == '.' or newurl[-1] == ')' or newurl[-1] == '!':
+      newurl = newurl[:-1]
+    urls.append(newurl)
+
+  newurls = []
+  for url in urls:
+      newurls.append(getWebArchiveLink(url))
+
+#  print(conversion)
+  curPos = 0
+  for url, newurl in zip(urls,newurls):
+    if url in text[curPos:]:
+      print('url:' + url)
+      print('new url:' + newurl)
+      # print(text[curPos:])
+      newPos = text.index(url)
+      text = text[0:curPos] + text[curPos:].replace(url,newurl,1)
+      curPos = newPos
+  return text
+
+main()
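
The rewritten getWebArchiveLink() no longer posts to the web.archive.org/save/ endpoint the way lib.py did; it queries the Wayback Machine availability API and reads archived_snapshots.closest.url out of the JSON. For reference, the response it parses is shaped roughly like this (snapshot values are illustrative):

    # Shape of the availability-API response parsed by getWebArchiveLink()
    # (snapshot values below are illustrative):
    import json, requests

    r = requests.get('https://archive.org/wayback/available?url=example.com')
    print(json.dumps(json.loads(r.content), indent=2))
    # {
    #   "url": "example.com",
    #   "archived_snapshots": {
    #     "closest": {
    #       "available": true,
    #       "status": "200",
    #       "timestamp": "20230101000000",
    #       "url": "http://web.archive.org/web/20230101000000/http://example.com"
    #     }
    #   }
    # }

When no capture exists, archived_snapshots comes back empty, which is exactly the case the 'closest' membership check above guards against.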

D requirements.txt => requirements.txt +0 -7
@@ -1,7 +0,0 @@
-Flask==0.10.1
-itsdangerous==0.24
-Jinja2==2.8
-MarkupSafe==0.23
-requests==2.7.0
-Werkzeug==0.10.4
-wheel==0.24.0
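
With these Flask-era pins gone, the only third-party package plr.py still imports is requests, so if a requirements file is ever reintroduced it would presumably shrink to a single requests line (the 2.7.0 pin above being the version previously in use).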