# Source code for nifigator.pdfparser

# -*- coding: utf-8 -*-

import regex
import logging
from collections import namedtuple
from io import BytesIO
from typing import Union

from lxml import etree
from pdfminer.converter import XMLConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


class PDFDocument:
    """A PDF document held as the XML tree emitted by pdfminer's XMLConverter.

    The tree is populated by :meth:`parse` (from a PDF) or :meth:`open`
    (from a previously written XML file) and stored in ``self.tree``.
    The :attr:`text` and :attr:`page_offsets` properties are derived from it.
    """

    # Span of one page inside ``text``: 1-based page number plus the begin
    # and end character indices. Defined once on the class so all instances
    # share the same tuple type; ``self.PDF_offset`` keeps working.
    PDF_offset = namedtuple("PDF_offset", ["pageNumber", "beginIndex", "endIndex"])

    # Hyphen-like code points recognised at the end of a line. The ASCII
    # hyphen is kept last so it is a literal inside the character class.
    _HYPHENS = "\u00AD\u058A\u05BE\u0F0C\u1400\u1806\u2010\u2011\u2012\u2e17\u30A0-"

    # A hyphen between two letters that is followed by optional blanks, one
    # or two newlines and optional blanks. Compiled once instead of on every
    # property access.
    _HYPHEN_NEWLINE = regex.compile(
        r"(?<=\p{L})[" + _HYPHENS + "][ \t\u00a0\r]*\n{1,2}[ \t\u00a0]*(?=\\p{L})"
    )

    def __init__(
        self,
        join_hyphenated_words: bool = True,
        ignore_control_characters: str = "[\x00-\x08\x0b-\x0c\x0e-\x1f]",
    ):
        """Create an empty document.

        Args:
            join_hyphenated_words: when True, words split by a hyphen at a
                line break are joined again in ``text`` and the page offsets
                are corrected accordingly.
            ignore_control_characters: regular expression; every match is
                removed from the extracted text.
        """
        self.join_hyphenated_words = join_hyphenated_words
        self.control_characters_to_ignore = regex.compile(ignore_control_characters)

    def parse(
        self,
        file: Union[str, bytes, BytesIO] = None,
        codec: str = "utf-8",
        password: str = "",
        laparams: LAParams = LAParams(),
    ):
        """Convert a PDF to XML and store the result in ``self.tree``.

        Args:
            file: path of the PDF, its raw bytes, or an already opened binary
                stream. A stream passed in by the caller is read but not
                closed.
            codec: codec handed to the XML converter
            password: password used to open the PDF
            laparams: laparams for the pdfminer.six parser; forwarded
                unchanged to ``XMLConverter``

        Returns:
            This PDFDocument, so calls can be chained.
        """
        rsrcmgr = PDFResourceManager()
        retstr = BytesIO()
        device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        # Only close what this method opened or created itself.
        # (`open` here is the builtin: class attributes are not visible as
        # bare names inside a method body.)
        close_fp = True
        if isinstance(file, str):
            fp = open(file, "rb")
        elif hasattr(file, "read"):
            fp = file
            close_fp = False
        else:
            fp = BytesIO(file)

        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(
                fp,
                set(),
                maxpages=0,
                password=password,
                caching=True,
                check_extractable=False,
            ):
                interpreter.process_page(page)
        finally:
            if close_fp:
                fp.close()
            # must be closed before reading retstr, as in the original flow
            device.close()

        result = retstr.getvalue()
        retstr.close()

        self.tree = etree.fromstring(result)

        return self

    def open(self, input: Union[str, bytes]):
        """Load a PDFDocument that was stored as XML.

        Args:
            input: the location of the XML file, or a bytes object holding
                the file content

        Returns:
            This PDFDocument, so calls can be chained.

        Raises:
            TypeError: if ``input`` is neither a string nor bytes.
        """
        if isinstance(input, str):
            # binary mode: the XML parser handles the encoding itself
            with open(input, "rb") as f:
                self.tree = etree.parse(f).getroot()
        elif isinstance(input, (bytes, bytearray)):
            self.tree = etree.parse(BytesIO(input)).getroot()
        else:
            raise TypeError(
                "invalid input, instead of bytes or string it is " + str(type(input))
            )
        return self

    def write(self, output: str) -> None:
        """Write the PDFDocument as pretty-printed UTF-8 XML.

        Args:
            output: the location where the XML is to be stored
        """
        self.tree.getroottree().write(
            output, encoding="utf-8", pretty_print=True, xml_declaration=True
        )

    def getstream(self) -> BytesIO:
        """Serialise the PDFDocument as XML into an in-memory stream.

        Returns:
            A BytesIO holding the same XML that :meth:`write` produces,
            positioned at the start so it can be read directly.
        """
        output = BytesIO()
        self.tree.getroottree().write(
            output, encoding="utf-8", pretty_print=True, xml_declaration=True
        )
        output.seek(0)
        return output

    def _page_text(self, page) -> str:
        """Return the raw text of one page element (control characters kept)."""
        parts = []
        for child in page:
            if child.tag == "textbox":
                for textline in child:
                    parts.extend(el.text for el in textline if el.text is not None)
                # every textbox is terminated by a newline
                parts.append("\n")
            elif child.tag == "figure":
                parts.extend(
                    el.text
                    for el in child
                    if el.text is not None and el.text != "\n        "
                )
            elif child.tag == "textline":
                parts.extend(el.text for el in child if el.text is not None)
        return "".join(parts)

    @property
    def text(self):
        """
        Property to extract text from PDFDocument
        Return: str
        """
        text = "".join(self._page_text(page) for page in self.tree)

        # delete control characters
        text = self.control_characters_to_ignore.sub("", text)

        # delete hyphens
        if self.join_hyphenated_words:
            text = self._HYPHEN_NEWLINE.sub("", text)

        return text

    @property
    def page_offsets(self):
        """
        Property to extract page offsets from PDFDocument
        Return: list of PDF_offset elements (named tuples), page numbers
        starting at 1
        """
        # Cleaned text per page. Cleaning the page text as a whole (including
        # the newline added after each textbox) keeps the lengths in line
        # with what the ``text`` property produces.
        clean = self.control_characters_to_ignore.sub
        page_texts = [clean("", self._page_text(page)) for page in self.tree]

        # begin/end index of every page in the uncorrected text
        bounds = []
        position = 0
        for page_text in page_texts:
            bounds.append((position, position + len(page_text)))
            position += len(page_text)

        if not self.join_hyphenated_words:
            return [
                self.PDF_offset(number, begin, end)
                for number, (begin, end) in enumerate(bounds, start=1)
            ]

        # Find all removable hyphen/newline spans in a single pass instead of
        # re-scanning the accumulated text for every page.
        spans = [
            (match.end(), match.end() - match.start())
            for match in self._HYPHEN_NEWLINE.finditer("".join(page_texts))
        ]

        page_offsets = []
        removed = 0  # characters removed by spans counted so far
        span_idx = 0
        page_start_correction = 0
        for number, (page_start, page_end) in enumerate(bounds, start=1):
            # A span counts for this page only when the letter following it
            # (at index ``end``) is still part of the text up to this page,
            # hence the strict comparison. A span that straddles the page
            # boundary is counted for the next page.
            while span_idx < len(spans) and spans[span_idx][0] < page_end:
                removed += spans[span_idx][1]
                span_idx += 1
            page_end_correction = removed

            if page_end_correction > 0:
                logging.debug(
                    "nifigator.pdfparser.page_offsets: page_start %s corrected with %s",
                    page_start,
                    page_start_correction,
                )
                logging.debug(
                    "nifigator.pdfparser.page_offsets: page_end   %s corrected with %s",
                    page_end,
                    page_end_correction,
                )

            # append corrected page offsets
            page_offsets.append(
                self.PDF_offset(
                    number,
                    page_start - page_start_correction,
                    page_end - page_end_correction,
                )
            )
            # the next page starts where this one ended
            page_start_correction = page_end_correction

        return page_offsets