~wrycode/plr

f9e4bbbe06e809f22aaea6356898f86ea2200c58 — Nick Econopouly 5 months ago 2be2df0
added conversion of command-line urls
1 file changed, 39 insertions(+), 31 deletions(-)

M plr.py
M plr.py => plr.py +39 -31
@@ 7,24 7,35 @@ import requests

def parseargs():
    """Parse command-line options.

    Returns an argparse.Namespace with:
      input  -- readable file object (default: stdin)
      output -- writable file object (default: stdout)
      l      -- True to emit a list of archive links instead of replacing
                them inline in the text
      urls   -- any remaining positional arguments, treated as URLs
    """
    parser = argparse.ArgumentParser()
    # NOTE: the old short-only '-i'/'-o' definitions were superseded by these
    # long-option forms; registering both would make argparse raise a
    # conflicting-option-string error, so only the new forms are kept.
    parser.add_argument('-i', '--input-file', type=argparse.FileType('r'),
                        default=sys.stdin, action='store', dest='input',
                        help='specify an input file (default: stdin)')
    parser.add_argument('-o', '--output-file', type=argparse.FileType('w'),
                        default=sys.stdout, action='store', dest='output',
                        help='specify an output file (default: stdout)')
    parser.add_argument('-l', action='store_true',
                        help='output a list of archive links instead of replacing them \
                        in the text')
    # everything after the recognized options is collected as raw URLs
    parser.add_argument('urls', nargs=argparse.REMAINDER)
    return parser.parse_args()

def main():
    """Entry point: read text (from extra args, a file, or stdin), replace
    its links with archive links, and write the result to the chosen output.
    """
    # get command-line arguments
    args = parseargs()

    # set where to write output: the file's write method, or print for stdout
    if args.output.name != '<stdout>':
        writeoutput = args.output.write
    else:
        writeoutput = print

    # read the input from extra command-line args, stdin, or a file.
    # NOTE: the input must only be read here — reading it earlier would
    # drain stdin and leave nothing for this branch.
    if args.urls:
        # Ugly workaround because the regex is written for markdown links
        content = '\n'.join(args.urls)
    else:
        content = args.input.read()
    writeoutput(replaceLinks(content))

def isurl(s):
    if re.match(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', s):


@@ 49,27 60,24 @@ def getWebArchiveLink(url):
      return url

def replaceLinks(text):
    """Replace every http(s) URL found in *text* with its archive link.

    Returns *text* with each detected URL substituted by the value of
    getWebArchiveLink(url). URLs are cleaned of trailing markdown/HTML
    artifacts ('"', '<') and trailing punctuation ('.', ')', '!') before
    lookup.
    """
    urls = []
    # Regex needs a rewrite: naive pattern, over-matches trailing chars
    for url in re.findall(r'(https?://[^\s]+)', text):
        newurl = url.split('"')[0].split('<')[0]
        # strip trailing punctuation; guard against stripping to empty
        while newurl and newurl[-1] in ('.', ')', '!'):
            newurl = newurl[:-1]
        # BUGFIX: append once per URL, after stripping finishes — the old
        # code appended inside the while loop, so clean URLs were dropped
        # and punctuated ones were added multiple times.
        if newurl:
            urls.append(newurl)

    newurls = [getWebArchiveLink(url) for url in urls]

    # BUGFIX: perform the replacement pass once over the whole text, not
    # nested inside the URL-collection loop as in the previous revision.
    curPos = 0
    for url, newurl in zip(urls, newurls):
        if url in text[curPos:]:
            newPos = text.index(url)
            text = text[0:curPos] + text[curPos:].replace(url, newurl)
            curPos = newPos
    return text

# Run only when executed as a script; the bare top-level main() call was
# removed so importing this module no longer triggers a second execution.
if __name__ == '__main__':
    main()