2be2df09862dda4ac062a9645dda478be0245a30 — Nick Econopouly 26 days ago 21cb75a
Refactored everything into plr.py.

- also removed asynchronous link lookup (might reimplement later)
3 files changed, 58 insertions(+), 88 deletions(-)

D lib.py
M plr.py
D requirements.txt
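
Note: the commit message says the asynchronous link lookup (the multiprocessing.Pool path in the deleted lib.py) was removed and might be reimplemented later. A minimal sketch of one way it could come back, using the standard-library concurrent.futures instead of worker processes; the helper name lookup_links_concurrently is made up here, not part of this commit:

    from concurrent.futures import ThreadPoolExecutor

    def lookup_links_concurrently(lookup, urls, max_workers=8):
        # lookup is whatever resolver is in use, e.g. getWebArchiveLink from plr.py.
        # The work is network-bound, so a thread pool is enough; map() keeps the
        # results in the same order as the input urls.
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            return list(pool.map(lookup, urls))

Threads sidestep the pickling constraints of multiprocessing.Pool and are a natural fit for I/O-bound HTTP requests like these.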
D lib.py => lib.py +0 -73
@@ -1,73 +0,0 @@
- # -*- coding: utf-8 -*-
- import sys
- import re
- import json
- from multiprocessing import Pool, cpu_count
- 
- import requests
- 
- 
- def isurl(s):
-     if re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s):
-         return True
-     else:
-         return False
- 
-   
- def getWebArchiveLink(url):
-   # will need to handle cases that it can't find, like for http://w3techs.com/technologies/overview/javascript_library/all
-   if 'web.archive' in url:
-     return url,url
-   try:
-     r = requests.get('https://web.archive.org/save/' + url)
-     print("Got permanent link for " + url)
-   except:
-     return url,url
-   if r.status_code == 403:
-     return url,url
-   else:
-     try:
-       return url,'https://web.archive.org' + r.headers['content-location']
-     except:
-       print(url)
-       return url,url
-     
- 
- def replaceText(text_test,apikey):
-   urls = []
-   urls_in_order = []
-   for url in  re.findall(r'(https?://[^\s]+)', text_test):
-     newurl = url.split('"')[0].split('<')[0]
-     while newurl[-1] == '.' or newurl[-1] == ')' or newurl[-1] == '!':
-       newurl = newurl[:-1]
-     if not apikey:
-       urls.append(newurl)
-     else:
-       urls.append((newurl,apikey))
-     urls_in_order.append(newurl)
- 
- 
-   f = getWebArchiveLink
-   if apikey:
-     f = getPermaccLink
-   p = Pool(cpu_count())
-   conversion = {}
-   for result in p.map(f, list(set(urls))):
-     conversion[result[0]] = result[1]    
-   p.terminate()
- 
-   print(conversion)
-   curPos = 0
-   for url in urls_in_order:
-     if url in text_test[curPos:]:
-       print(url)
-       print(conversion[url])
-       print(text_test[curPos:])
-       newPos = text_test.index(url)
-       text_test = text_test[0:curPos] + text_test[curPos:].replace(url,conversion[url],1)
-       curPos = newPos
- 
-   return text_test  
- 
- 
- 

M plr.py => plr.py +58 -8
@@ -1,11 +1,15 @@
- from lib import *
+ # -*- coding: utf-8 -*-
+ import sys
  import argparse
+ import re
+ import json
+ import requests
  
  def parseargs():
      parser = argparse.ArgumentParser()
-     parser.add_argument('-if', nargs='?', type=argparse.FileType('r'),
+     parser.add_argument('-i', nargs='?', type=argparse.FileType('r'),
                          default=sys.stdin, action='store', dest='input')
-     parser.add_argument('-of', nargs='?', type=argparse.FileType('w'),
+     parser.add_argument('-o', nargs='?', type=argparse.FileType('w'),
                           default=sys.stdout, action='store', dest='output')
      return parser.parse_args()
  


@@ -18,8 +22,54 @@
      writeoutput = print
  
      content = args.input.read()
-     content = replaceText(content,False)
-     writeoutput(content)
-         
- if __name__ == '__main__':
-     main()
+     content = replaceLinks(content)
+     writeoutput(content)
+ 
+ 
+ def isurl(s):
+     if re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s):
+         return True
+     else:
+         return False
+ 
+ # return the closest existing Wayback Machine snapshot for url, or url itself
+ def getWebArchiveLink(url):
+   if 'web.archive' in url:
+     return url
+   try:
+     r = requests.get('https://archive.org/wayback/available?url=' + url, timeout=10)
+     rjson = json.loads(r.content)
+   except:
+     return url
+   if 'closest' not in rjson.get('archived_snapshots', {}):
+     return url
+   else:
+     try:
+       return rjson['archived_snapshots']['closest']['url']
+     except:
+       return url
+ 
+ # rewrite every http(s) link in text to its archived equivalent, when one exists
+ def replaceLinks(text):
+   urls = []
+   for url in re.findall(r'(https?://[^\s]+)', text):
+     newurl = url.split('"')[0].split('<')[0]
+     # strip trailing punctuation the regex drags along
+     while newurl[-1] == '.' or newurl[-1] == ')' or newurl[-1] == '!':
+       newurl = newurl[:-1]
+     urls.append(newurl)
+ 
+   newurls = []
+   for url in urls:
+       newurls.append(getWebArchiveLink(url))
+ 
+   curPos = 0
+   for url, newurl in zip(urls, newurls):
+     if url in text[curPos:]:
+       print('url:' + url)
+       print('new url:' + newurl)
+       newPos = text.index(url)
+       # replace only the first occurrence past curPos
+       text = text[0:curPos] + text[curPos:].replace(url, newurl, 1)
+       curPos = newPos
+   return text
+ 
+ if __name__ == '__main__':
+     main()
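
For reference, getWebArchiveLink above now asks the Wayback Machine availability endpoint for an existing snapshot instead of forcing a new capture via https://web.archive.org/save/. A minimal standalone illustration of the lookup it performs (the example URL is illustrative):

    import json
    import requests

    r = requests.get('https://archive.org/wayback/available?url=example.com', timeout=10)
    data = json.loads(r.content)
    # When a snapshot exists, data['archived_snapshots']['closest'] holds the
    # archived URL (plus 'timestamp', 'status', 'available'); when none exists,
    # 'archived_snapshots' is an empty dict, so the code above falls back to
    # returning the original url.
    print(data.get('archived_snapshots', {}).get('closest', {}).get('url'))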

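With this refactor plr.py is self-contained: it reads from stdin or the file given with -i, rewrites the links, and writes the result to stdout or the -o file (the url:/new url: progress prints also go to stdout). Typical invocations, with illustrative file names:

    python plr.py -i draft.md -o draft-archived.md
    cat notes.txt | python plr.py
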
D requirements.txt => requirements.txt +0 -7
@@ -1,7 +0,0 @@
- Flask==0.10.1
- itsdangerous==0.24
- Jinja2==2.8
- MarkupSafe==0.23
- requests==2.7.0
- Werkzeug==0.10.4
- wheel==0.24.0
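
Even with requirements.txt gone, plr.py still imports requests, so that one third-party dependency still has to be installed, e.g.:

    pip install requests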