~n0mn0m/archivist

archivist/Extractor/PdfExtractor.cs -rw-r--r-- 982 bytes
fd0058b7n0mn0m Clean up namespaces. 8 months ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
using System;
using System.Collections.Generic;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

namespace Extractor
{
    /// <summary>
    /// Extracts the text content of a PDF file, page by page, using iText.
    /// </summary>
    public class PdfExtractor : ITextExtractor<string>
    {
        // Only assigned in the constructor, so it can be readonly.
        private readonly string _inputFile;

        /// <summary>
        /// Creates an extractor bound to a single PDF input.
        /// </summary>
        /// <param name="inputFile">
        /// Value passed to <see cref="PdfReader"/> when <see cref="ExtractText"/> is called.
        /// </param>
        public PdfExtractor(string inputFile)
        {
            _inputFile = inputFile;
        }

        /// <summary>
        /// Opens the PDF and extracts its text with a
        /// <see cref="SimpleTextExtractionStrategy"/>, one entry per page.
        /// </summary>
        /// <returns>
        /// A collection holding one string per page, in page order.
        /// </returns>
        public ICollection<string> ExtractText()
        {
            var pdfReader = new PdfReader(_inputFile);
            try
            {
                var pdfDoc = new PdfDocument(pdfReader);
                try
                {
                    // The page count is loop-invariant: read it once and presize the list.
                    var pageCount = pdfDoc.GetNumberOfPages();
                    var extractedLines = new List<string>(pageCount);

                    // Page numbers start at 1 and run through pageCount inclusive.
                    for (var page = 1; page <= pageCount; page++)
                    {
                        // A fresh strategy per page so no state carries over between pages.
                        var strategy = new SimpleTextExtractionStrategy();
                        extractedLines.Add(PdfTextExtractor.GetTextFromPage(pdfDoc.GetPage(page), strategy));
                    }

                    return extractedLines;
                }
                finally
                {
                    // Close the document even when extraction throws.
                    pdfDoc.Close();
                }
            }
            finally
            {
                // Close the reader even when PdfDocument construction or extraction fails,
                // so the underlying file is not left open.
                pdfReader.Close();
            }
        }
    }
}