TLDR-903 upgrade PyPDF2 to pypdf>=4; fix bug with PDF attachments (#515)

NastyBoget · web-flow · commit 473a6a7b71d6 · 2025-02-14T17:21:36.000+03:00
diff --git a/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py b/dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py
@@ -1,6 +1,6 @@
 from typing import List, Optional, Tuple
 
-from PyPDF2.pdf import PageObject, PdfFileReader
+from pypdf import PageObject, PdfReader
 
 from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
 from dedoc.data_structures.attached_file import AttachedFile
@@ -22,15 +22,15 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
         the methods' parameters.
         """
         import os
-        from PyPDF2.utils import PdfReadError
         from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
+        from pypdf.errors import PdfReadError
 
         parameters = {} if parameters is None else parameters
         filename = os.path.basename(file_path)
 
         with open(file_path, "rb") as handler:
             try:
-                reader = PdfFileReader(handler)
+                reader = PdfReader(handler)
             except Exception as e:
                 self.logger.warning(f"can't handle {filename}, get {e}")
                 return []
@@ -55,13 +55,13 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
         if "/Annots" in page.keys():
             for annot in page["/Annots"]:
                 # Other subtypes, such as /Link, cause errors
-                subtype = annot.getObject().get("/Subtype")
+                subtype = annot.get_object().get("/Subtype")
                 if subtype == "/FileAttachment":
-                    name = annot.getObject()["/FS"]["/UF"]
-                    data = annot.getObject()["/FS"]["/EF"]["/F"].getData()  # The file containing the stream data.
+                    name = annot.get_object()["/FS"]["/UF"]
+                    data = annot.get_object()["/FS"]["/EF"]["/F"].get_data()  # The file containing the stream data.
                     attachments.append([name, data])
-                if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment":  # it is messages (notes) in PDF
-                    note = annot.getObject()
+                if subtype == "/Text" and annot.get_object().get("/Name") == "/Comment":  # it is messages (notes) in PDF
+                    note = annot.get_object()
                     created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None
                     modified_time = convert_datetime(note["/M"]) if "/M" in note else None
                     user = note.get("/T")
@@ -71,17 +71,15 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
                     attachments.append((name, bytes(content)))
         return attachments
 
-    def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
-        cnt_page = reader.getNumPages()
+    def __get_page_level_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
         attachments = []
-        for i in range(cnt_page):
-            page = reader.getPage(i)
+        for page in reader.pages:
             attachments_on_page = self.__get_notes(page)
             attachments.extend(attachments_on_page)
 
         return attachments
 
-    def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
+    def __get_root_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
         """
         Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring.
 
@@ -96,9 +94,9 @@ def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes
             for f in file_names:
                 if isinstance(f, str):
                     data_index = file_names.index(f) + 1
-                    dict_object = file_names[data_index].getObject()
+                    dict_object = file_names[data_index].get_object()
                     if "/EF" in dict_object and "/F" in dict_object["/EF"]:
-                        data = dict_object["/EF"]["/F"].getData()
+                        data = dict_object["/EF"]["/F"].get_data()
                         name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid4()}")
                         attachments.append((name, data))
 
diff --git a/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py b/dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py
@@ -58,13 +58,13 @@ def extract(self,
         return result
 
     def _get_pdf_info(self, path: str) -> dict:
-        from PyPDF2 import PdfFileReader
-        from PyPDF2.utils import PdfReadError
+        from pypdf import PdfReader
+        from pypdf.errors import PdfReadError
 
         try:
             with open(path, "rb") as file:
-                document = PdfFileReader(file)
-                document_info = document.getDocumentInfo() if document.getDocumentInfo() is not None else {}
+                document = PdfReader(file)
+                document_info = document.metadata if document.metadata is not None else {}
                 result = self.__prettify_metadata(document_info)
             return result
         except PdfReadError:
diff --git a/docs/source/_static/code_examples/pdf_attachment_extractor.py b/docs/source/_static/code_examples/pdf_attachment_extractor.py
@@ -1,8 +1,6 @@
 import os
 from typing import List, Optional
 
-import PyPDF2
-
 from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
 from dedoc.data_structures import AttachedFile
 from dedoc.extensions import recognized_extensions, recognized_mimes
@@ -20,21 +18,26 @@ def can_extract(self,
         return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format
 
     def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
+        from pypdf import PdfReader
+
         parameters = {} if parameters is None else parameters
-        handler = open(os.path.join(file_path), "rb")
-        reader = PyPDF2.PdfFileReader(handler)
-        catalog = reader.trailer["/Root"]
-        attachments = []
-        if "/Names" not in catalog or "/EmbeddedFiles" not in catalog["/Names"]:
-            return attachments
-        filenames = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
-        for filename in filenames:
-            if isinstance(filename, str):
-                name = filename
-                data_index = filenames.index(filename) + 1
-                f_dict = filenames[data_index].getObject()
-                f_data = f_dict["/EF"]["/F"].getData()
-                attachments.append((name, f_data))
+        with open(os.path.join(file_path), "rb") as f:
+            reader = PdfReader(f)
+            catalog = reader.trailer["/Root"]
+
+            if "/Names" not in catalog or "/EmbeddedFiles" not in catalog["/Names"]:
+                return []
+
+            attachments = []
+            filenames = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
+            for filename in filenames:
+                if isinstance(filename, str):
+                    name = filename
+                    data_index = filenames.index(filename) + 1
+                    f_dict = filenames[data_index].get_object()
+                    f_data = f_dict["/EF"]["/F"].get_data()
+                    attachments.append((name, f_data))
+
         attachments_dir = get_param_attachments_dir(parameters, file_path)
         need_content_analysis = get_param_need_content_analysis(parameters)
         attachments = self._content2attach_file(content=attachments, tmpdir=attachments_dir, need_content_analysis=need_content_analysis, parameters=parameters)
diff --git a/docs/source/_static/code_examples/pdf_reader.py b/docs/source/_static/code_examples/pdf_reader.py
@@ -1,7 +1,6 @@
 from typing import List, Optional
 
 import tabula
-from PyPDF2 import PdfFileReader
 from pdf_attachment_extractor import PdfAttachmentsExtractor
 
 from dedoc.data_structures import CellWithMeta, LineMetadata
@@ -41,13 +40,12 @@ def __process_tables(self, path: str) -> List[Table]:
         return tables
 
     def __process_lines(self, path: str) -> List[LineWithMeta]:
+        from pypdf import PdfReader as PdfFileReader
         with open(path, "rb") as file:
             lines_with_meta = []
             pdf = PdfFileReader(file)
-            num_pages = pdf.getNumPages()
-            for page_id in range(num_pages):
-                page = pdf.getPage(page_id)
-                text = page.extractText()
+            for page_id, page in enumerate(pdf.pages):
+                text = page.extract_text()
                 lines = text.split("\n")
                 for line_id, line in enumerate(lines):
                     metadata = LineMetadata(page_id=page_id, line_id=line_id)
diff --git a/docs/source/tutorials/add_new_doc_format.rst b/docs/source/tutorials/add_new_doc_format.rst
@@ -185,7 +185,7 @@ You should implement the following methods:
 For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
 For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
 You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
-We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
+We use `pypdf <https://pypdf.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
 They must be added to ``requirements.txt`` of the project.
 We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
 It must be added to the reader's constructor and used in ``read`` method.
diff --git a/requirements.txt b/requirements.txt
@@ -17,8 +17,7 @@ pdfminer.six>=20211012,<=20231228
 piexif==1.1.3
 puremagic>=1.0,<2.0 # needs libmagic to be installed in the system
 pylzma==0.5.0
-pypdf>=3.17.0,<=4.1.0
-PyPDF2==1.27.0
+pypdf>=4.0,<6.0
 pytesseract==0.3.10
 python-docx==0.8.11
 python-Levenshtein==0.12.2