~idmyn/bubbin_bot

0ad37f5d96f1a47cf121ac61a6b386fffd0122f9 — David Mynors 18 days ago a8131be main
Add pdf scanning functionality
8 files changed, 149 insertions(+), 11 deletions(-)

M .gitignore
M Dockerfile
M README.rst
M bubbin_bot/bot.py
A bubbin_bot/pdf.py
M poetry.lock
M pyproject.toml
M requirements.txt
M .gitignore => .gitignore +1 -0
@@ 1,3 1,4 @@
.env
.envrc
**/__pycache__
docker-compose.yaml

M Dockerfile => Dockerfile +18 -3
@@ 1,10 1,25 @@
FROM python:3.8-slim

# tesseract-ocr is the OCR engine that pytesseract wraps; poppler-utils
# provides the pdftoppm/pdftocairo tools that pdf2image wraps.
# The apt package lists are deleted in the same layer so they are not baked
# into the image.
RUN apt-get update && apt-get install -y \
  tesseract-ocr \
  poppler-utils \
  && rm -rf /var/lib/apt/lists/*

# tini is used as the container entrypoint (see ENTRYPOINT below).
ENV TINI_VERSION v0.19.0
ADD https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini /tini
RUN chmod +x /tini

WORKDIR /app

# Dependencies are installed before the source is copied so that this layer
# is only rebuilt when requirements.txt changes.
COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .

# Run the bot as an unprivileged user that owns the application directory.
RUN useradd -ms /bin/bash svc
RUN chown -R svc /app
USER svc

ENTRYPOINT ["/tini", "--"]

# -u keeps stdout/stderr unbuffered so log lines show up immediately.
CMD ["python", "-um", "bubbin_bot"]

M README.rst => README.rst +1 -2
@@ 5,8 5,7 @@ This is a Telegram bot that serves as a handy interface for a couple of
utilities. If sent an image, it'll resize it if necessary (to fit the Telegram
sticker restriction of 512 pixels), replace white pixels with transparency, and
return the modified image to the sender. If sent a pdf, it'll process the pages
with Python-tesseract_ and return a pdf that can be searched and highlighted
(WIP).
with Python-tesseract_ and return a pdf that can be searched and highlighted.

.. _Python-tesseract: https://github.com/madmaze/pytesseract


M bubbin_bot/bot.py => bubbin_bot/bot.py +42 -5
@@ 2,6 2,7 @@ from telegram.ext import Updater, CommandHandler, MessageHandler, Filters
import os
import requests
from . import img
from . import pdf
import logging
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)


@@ 13,10 14,10 @@ ALLOWLIST = ['idmyn', 'bubbin']

def start(update, context):
    """Greet the user in response to the ``/start`` command.

    Sends one message to the chat the update came from, telling the user
    which kinds of files the bot accepts.

    Args:
        update: the incoming update; its ``effective_chat.id`` is the reply
            target.
        context: the callback context; ``context.bot`` sends the reply.
    """
    # ``text`` may be passed only once (a repeated keyword argument is a
    # SyntaxError), so only the greeting that also mentions PDFs is sent.
    context.bot.send_message(chat_id=update.effective_chat.id,
                             text="I'm a bot, send me PDFs and pics!")


def image(update, context):
def handle_image(update, context):
    context.bot.send_message(chat_id=update.effective_chat.id,
                             text="nice pic")



@@ 31,26 32,62 @@ def image(update, context):
    r = requests.get(file_path_url)
    file_path = r.json()['result']['file_path']
    file_url = 'https://api.telegram.org/file/bot{0}/{1}'.format(TOKEN, file_path)
    image = requests.get(file_url)
    image = requests.get(file_url).content

    bytes = img.white_to_transparent(image.content)
    bytes = img.white_to_transparent(image)
    bytes.name = "img.png"
    bytes.seek(0)
    context.bot.send_document(chat_id=update.effective_chat.id,
                              document=bytes)


def handle_pdf(update, context):
    """OCR a PDF sent to the bot and reply with the processed copy.

    Every sender gets a short acknowledgement, but only chats whose username
    is listed in ``ALLOWLIST`` have their document processed. For those, the
    file is downloaded through the Telegram Bot API, handed to ``pdf.scan``
    and the result is sent back as ``scanned-<original file name>``.

    Args:
        update: the incoming update carrying the PDF document.
        context: the callback context; ``context.bot`` sends the replies.

    Raises:
        requests.RequestException: if a Telegram request times out, fails,
            or comes back with an HTTP error status.
    """
    chat_id = update.effective_chat.id
    context.bot.send_message(chat_id=chat_id, text="nice pdf")

    if update.effective_chat.username not in ALLOWLIST:
        return  # I don't want to process PDFs from strangers

    context.bot.send_message(chat_id=chat_id, text="gimmie a sec...")

    document = update.message.document
    # The file name is optional on Telegram documents; fall back to a generic
    # one instead of producing "scanned-None".
    file_name = document.file_name or 'document.pdf'
    file_id = document.file_id

    # (connect, read) timeouts: without them a stalled connection would block
    # this handler indefinitely.
    timeout = (5, 60)

    file_path_url = 'https://api.telegram.org/bot{0}/getFile?file_id={1}'.format(TOKEN, file_id)
    r = requests.get(file_path_url, timeout=timeout)
    r.raise_for_status()
    file_path = r.json()['result']['file_path']

    file_url = 'https://api.telegram.org/file/bot{0}/{1}'.format(TOKEN, file_path)
    download = requests.get(file_url, timeout=timeout)
    # Fail loudly rather than feeding an HTML/JSON error body to the scanner.
    download.raise_for_status()
    original_pdf = download.content

    scanned = pdf.scan(original_pdf)

    # The bot uses ``name`` as the uploaded file name; rewind so the whole
    # buffer is sent.
    scanned.name = f'scanned-{file_name}'
    scanned.seek(0)
    context.bot.send_document(chat_id=chat_id, document=scanned)


def error_handler(update, context):
    """Log a failure raised while handling an update and notify the chat.

    The exception found on ``context.error`` is always logged. The apology
    message is only sent when the failing update can be tied to a chat:
    ``update`` may be ``None`` (or carry no chat) for errors that did not
    originate from a specific message, and replying in that case would raise
    a second exception from inside the error handler.

    Args:
        update: the update being processed when the error occurred, or
            ``None``.
        context: the callback context; ``context.error`` holds the exception
            and ``context.bot`` sends the reply.
    """
    logging.getLogger(__name__).error("error while handling an update",
                                      exc_info=context.error)

    chat = getattr(update, 'effective_chat', None)
    if chat is None:
        return  # nobody to notify

    context.bot.send_message(chat_id=chat.id,
                             text="uh oh, something went wrong...")


def run():
    """Wire up the handlers and start polling Telegram for updates.

    Builds an ``Updater`` from ``TOKEN``, registers the error handler, the
    ``/start`` command, and the photo and PDF message handlers, then starts
    polling and hands control to ``updater.idle()``.
    """
    print("running...")

    updater = Updater(TOKEN, use_context=True)
    dispatcher = updater.dispatcher
    dispatcher.add_error_handler(error_handler)

    start_handler = CommandHandler('start', start)
    dispatcher.add_handler(start_handler)

    # Photos are served by ``handle_image``. Only one handler is built: an
    # extra assignment pointing at the stale name ``image`` would be
    # overwritten before ever being registered.
    image_handler = MessageHandler(Filters.photo, handle_image)
    dispatcher.add_handler(image_handler)

    pdf_handler = MessageHandler(Filters.document.pdf, handle_pdf)
    dispatcher.add_handler(pdf_handler)

    updater.start_polling()
    updater.idle()

A bubbin_bot/pdf.py => bubbin_bot/pdf.py +36 -0
@@ 0,0 1,36 @@
import os
import tempfile
import PyPDF2
import pdf2image
import pytesseract
from io import BytesIO


def scan(document):
    """Run OCR over every page of a PDF and return the merged result.

    The document is rasterised into page images with ``pdf2image``, each
    image is converted into its own PDF by ``images_to_pdf_paths``, and the
    per-page PDFs are merged in page order. All intermediate files live in a
    temporary directory that is removed before returning.

    Args:
        document: the raw bytes of the source PDF.

    Returns:
        A ``BytesIO`` holding the merged PDF. The buffer is not rewound
        here; callers should ``seek(0)`` before reading from it.
    """
    with tempfile.TemporaryDirectory() as dest:
        images = pdf2image.convert_from_bytes(document, output_folder=dest)

        pdf_paths = images_to_pdf_paths(images, dest)

        merger = PyPDF2.PdfFileMerger()
        try:
            for pdf_path in pdf_paths:
                merger.append(pdf_path)

            output = BytesIO()
            merger.write(output)
        finally:
            # The merger keeps the appended files open until it is closed;
            # release them here, while the temporary directory still exists,
            # so handles do not leak across calls.
            merger.close()
        return output


def images_to_pdf_paths(images, output_folder_path):
    """Convert page images into individual PDF files via pytesseract.

    Each image is saved as ``<n>.jpg`` inside ``output_folder_path``, passed
    to ``pytesseract.image_to_pdf_or_hocr`` and the returned PDF bytes are
    written next to it as ``<n>.pdf``.

    Args:
        images: iterable of image objects exposing ``save(path)``.
        output_folder_path: directory that receives the generated files.

    Returns:
        The paths of the written PDF files, in the same order as ``images``.
    """
    written = []
    # Numbering starts at 10 to make the file path strings easier to sort
    # later.
    for number, page in enumerate(images, start=10):
        stem = f'{output_folder_path}/{number}'

        jpg_path = f'{stem}.jpg'
        page.save(jpg_path)

        ocr_bytes = pytesseract.image_to_pdf_or_hocr(jpg_path, extension='pdf')

        target = f'{stem}.pdf'
        with open(target, 'w+b') as out:
            out.write(ocr_bytes)
        written.append(target)
    return written

M poetry.lock => poetry.lock +41 -1
@@ 122,6 122,17 @@ six = "*"

[[package]]
category = "main"
description = "A wrapper around the pdftoppm and pdftocairo command line tools to convert PDF to a PIL Image list."
name = "pdf2image"
optional = false
python-versions = "*"
version = "1.13.1"

[package.dependencies]
pillow = "*"

[[package]]
category = "main"
description = "Python Imaging Library (Fork)"
name = "pillow"
optional = false


@@ 164,6 175,25 @@ python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
version = "2.4.6"

[[package]]
category = "main"
description = "PDF toolkit"
name = "pypdf2"
optional = false
python-versions = "*"
version = "1.26.0"

[[package]]
category = "main"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
name = "pytesseract"
optional = false
python-versions = "*"
version = "0.3.4"

[package.dependencies]
Pillow = "*"

[[package]]
category = "dev"
description = "pytest: simple powerful testing with Python"
name = "pytest"


@@ 260,7 290,7 @@ python-versions = "*"
version = "0.1.8"

[metadata]
content-hash = "f159089b7ad88d8c253d3ecd3f246937d4b3e921404a95cfd0edbe91181f0a2d"
content-hash = "86880a095e9ff13fc84e1d9ef0a5998abc8a54f8d78e28f04633b2a8058a307e"
python-versions = "^3.8"

[metadata.files]


@@ 356,6 386,10 @@ packaging = [
    {file = "packaging-20.1-py2.py3-none-any.whl", hash = "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73"},
    {file = "packaging-20.1.tar.gz", hash = "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334"},
]
pdf2image = [
    {file = "pdf2image-1.13.1-py3-none-any.whl", hash = "sha256:ed2935991de449e55ceea2eff7c5d18c7b5cde4a2f6b9f3d56a430e8c5b77969"},
    {file = "pdf2image-1.13.1.tar.gz", hash = "sha256:df6b825f7f26df35b873642725a7ee37dfc8a531b711274a8ad2ee830c8b72d0"},
]
pillow = [
    {file = "Pillow-7.0.0-cp35-cp35m-macosx_10_6_intel.whl", hash = "sha256:5f3546ceb08089cedb9e8ff7e3f6a7042bb5b37c2a95d392fb027c3e53a2da00"},
    {file = "Pillow-7.0.0-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:9d2ba4ed13af381233e2d810ff3bab84ef9f18430a9b336ab69eaf3cd24299ff"},


@@ 395,6 429,12 @@ pyparsing = [
    {file = "pyparsing-2.4.6-py2.py3-none-any.whl", hash = "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec"},
    {file = "pyparsing-2.4.6.tar.gz", hash = "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f"},
]
pypdf2 = [
    {file = "PyPDF2-1.26.0.tar.gz", hash = "sha256:e28f902f2f0a1603ea95ebe21dff311ef09be3d0f0ef29a3e44a932729564385"},
]
pytesseract = [
    {file = "pytesseract-0.3.4.tar.gz", hash = "sha256:afd8a5cdf8ab5d35690efbe71cbf5f89419f668ea8dde7649149815d5c5a899a"},
]
pytest = [
    {file = "pytest-5.3.5-py3-none-any.whl", hash = "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6"},
    {file = "pytest-5.3.5.tar.gz", hash = "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d"},

M pyproject.toml => pyproject.toml +3 -0
@@ 12,6 12,9 @@ python = "^3.8"
pillow = "^7.0.0"
python-telegram-bot = "^12.4.2"
requests = "^2.23.0"
PyPDF2 = "^1.26.0"
pdf2image = "^1.13.1"
pytesseract = "^0.3.4"

[tool.poetry.dev-dependencies]
pytest = "^5.2"

M requirements.txt => requirements.txt +7 -0
@@ 63,6 63,9 @@ future==0.18.2 \
idna==2.9 \
    --hash=sha256:a068a21ceac8a4d63dbfd964670474107f541babbd2250d61922f029858365fa \
    --hash=sha256:7588d1c14ae4c77d74036e8c22ff447b26d0fde8f007354fd48a7814db15b7cb
pdf2image==1.13.1 \
    --hash=sha256:ed2935991de449e55ceea2eff7c5d18c7b5cde4a2f6b9f3d56a430e8c5b77969 \
    --hash=sha256:df6b825f7f26df35b873642725a7ee37dfc8a531b711274a8ad2ee830c8b72d0
pillow==7.0.0 \
    --hash=sha256:5f3546ceb08089cedb9e8ff7e3f6a7042bb5b37c2a95d392fb027c3e53a2da00 \
    --hash=sha256:9d2ba4ed13af381233e2d810ff3bab84ef9f18430a9b336ab69eaf3cd24299ff \


@@ 88,6 91,10 @@ pillow==7.0.0 \
    --hash=sha256:4d9ed9a64095e031435af120d3c910148067087541131e82b3e8db302f4c8946
pycparser==2.19 \
    --hash=sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3
pypdf2==1.26.0 \
    --hash=sha256:e28f902f2f0a1603ea95ebe21dff311ef09be3d0f0ef29a3e44a932729564385
pytesseract==0.3.4 \
    --hash=sha256:afd8a5cdf8ab5d35690efbe71cbf5f89419f668ea8dde7649149815d5c5a899a
python-telegram-bot==12.4.2 \
    --hash=sha256:0a97cbca638f949582b4ee326170d2f8d7f4bf559a4e511132bb2203623e04ad \
    --hash=sha256:d3cffd020af4094d07c11783f875e5c682072ba7f5bc21ce89ff0222f4e6d742