~wrycode/plr: plr/lib.py
# -*- coding: utf-8 -*-
import re
import json
from multiprocessing import Pool, cpu_count

import requests


def isurl(s):
    # True if s contains something that looks like a URL
    # (a liberal URL-matching pattern).
    return re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s) is not None

  
def getWebArchiveLink(url):
    # Ask the Wayback Machine to save the URL and return (original, archived),
    # or (original, original) on any failure.
    # TODO: handle cases it can't find, like
    # http://w3techs.com/technologies/overview/javascript_library/all
    if 'web.archive' in url:
        return url, url
    try:
        r = requests.get('https://web.archive.org/save/' + url)
        print("Got permanent link for " + url)
    except requests.RequestException:
        return url, url
    if r.status_code == 403:
        return url, url
    try:
        return url, 'https://web.archive.org' + r.headers['content-location']
    except KeyError:
        print(url)
        return url, url
    
def getPermaccLink(dat):
    # Create a perma.cc archive for the URL and return (original, permalink),
    # or (original, original) on failure. dat is a (url, apikey) pair so the
    # function can be used with Pool.map.
    # TODO: handle cases it can't find, like
    # http://w3techs.com/technologies/overview/javascript_library/all
    url, apikey = dat
    payload = {'url': url, 'title': url}
    permacc_url = 'https://api.perma.cc/v1/archives/?api_key=' + apikey
    r = requests.post(permacc_url, data=json.dumps(payload))
    print(r.status_code)
    if r.status_code == 201:
        result = json.loads(r.text)
        print(json.dumps(result, indent=4))
        return url, 'http://perma.cc/' + result['guid'] + '?type=source'
    else:
        return url, url
  

def replaceText(text_test, apikey):
    # Find every URL in the text, archive each unique one (perma.cc if an
    # API key is given, otherwise the Wayback Machine), then replace each
    # occurrence in order with its permanent link.
    urls = []
    urls_in_order = []
    for url in re.findall(r'(https?://[^\s]+)', text_test):
        # Trim HTML remnants and trailing punctuation from the match.
        newurl = url.split('"')[0].split('<')[0].rstrip('.)!')
        if not apikey:
            urls.append(newurl)
        else:
            urls.append((newurl, apikey))
        urls_in_order.append(newurl)

    f = getPermaccLink if apikey else getWebArchiveLink
    conversion = {}
    with Pool(cpu_count()) as p:
        for original, permanent in p.map(f, list(set(urls))):
            conversion[original] = permanent

    print(conversion)
    # Replace occurrences left to right, advancing past each substitution
    # so repeated URLs are each replaced exactly once.
    curPos = 0
    for url in urls_in_order:
        newPos = text_test.find(url, curPos)
        if newPos == -1:
            continue
        text_test = text_test[:newPos] + conversion[url] + text_test[newPos + len(url):]
        curPos = newPos + len(conversion[url])

    return text_test
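

# A minimal usage sketch, not part of the original module: read a file named
# on the command line and print its text with every URL swapped for a
# permanent link. The optional second argument is a perma.cc API key; without
# it, links go through the Wayback Machine. The command-line handling here is
# an illustration, not the project's actual interface. The __main__ guard is
# required for multiprocessing.Pool on platforms that spawn workers.
if __name__ == '__main__':
    import sys

    with open(sys.argv[1]) as fh:
        text = fh.read()
    apikey = sys.argv[2] if len(sys.argv) > 2 else None
    print(replaceText(text, apikey))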