Skip to content

Commit 473a6a7

Browse files
authored
TLDR-903 upgrade PyPDF2 to pypdf>4; fix bug with PDF attachments (#515)
1 parent c074035 commit 473a6a7

File tree

6 files changed

+41
-43
lines changed

6 files changed

+41
-43
lines changed

dedoc/attachments_extractors/concrete_attachments_extractors/pdf_attachments_extractor.py

+13-15
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from typing import List, Optional, Tuple
22

3-
from PyPDF2.pdf import PageObject, PdfFileReader
3+
from pypdf import PageObject, PdfReader
44

55
from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
66
from dedoc.data_structures.attached_file import AttachedFile
@@ -22,15 +22,15 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
2222
the methods' parameters.
2323
"""
2424
import os
25-
from PyPDF2.utils import PdfReadError
2625
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
26+
from pypdf.errors import PdfReadError
2727

2828
parameters = {} if parameters is None else parameters
2929
filename = os.path.basename(file_path)
3030

3131
with open(file_path, "rb") as handler:
3232
try:
33-
reader = PdfFileReader(handler)
33+
reader = PdfReader(handler)
3434
except Exception as e:
3535
self.logger.warning(f"can't handle {filename}, get {e}")
3636
return []
@@ -55,13 +55,13 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
5555
if "/Annots" in page.keys():
5656
for annot in page["/Annots"]:
5757
# Other subtypes, such as /Link, cause errors
58-
subtype = annot.getObject().get("/Subtype")
58+
subtype = annot.get_object().get("/Subtype")
5959
if subtype == "/FileAttachment":
60-
name = annot.getObject()["/FS"]["/UF"]
61-
data = annot.getObject()["/FS"]["/EF"]["/F"].getData() # The file containing the stream data.
60+
name = annot.get_object()["/FS"]["/UF"]
61+
data = annot.get_object()["/FS"]["/EF"]["/F"].get_data() # The file containing the stream data.
6262
attachments.append([name, data])
63-
if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment": # it is messages (notes) in PDF
64-
note = annot.getObject()
63+
if subtype == "/Text" and annot.get_object().get("/Name") == "/Comment": # it is messages (notes) in PDF
64+
note = annot.get_object()
6565
created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None
6666
modified_time = convert_datetime(note["/M"]) if "/M" in note else None
6767
user = note.get("/T")
@@ -71,17 +71,15 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
7171
attachments.append((name, bytes(content)))
7272
return attachments
7373

74-
def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
75-
cnt_page = reader.getNumPages()
74+
def __get_page_level_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
7675
attachments = []
77-
for i in range(cnt_page):
78-
page = reader.getPage(i)
76+
for page in reader.pages:
7977
attachments_on_page = self.__get_notes(page)
8078
attachments.extend(attachments_on_page)
8179

8280
return attachments
8381

84-
def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
82+
def __get_root_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
8583
"""
8684
Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring.
8785
@@ -96,9 +94,9 @@ def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes
9694
for f in file_names:
9795
if isinstance(f, str):
9896
data_index = file_names.index(f) + 1
99-
dict_object = file_names[data_index].getObject()
97+
dict_object = file_names[data_index].get_object()
10098
if "/EF" in dict_object and "/F" in dict_object["/EF"]:
101-
data = dict_object["/EF"]["/F"].getData()
99+
data = dict_object["/EF"]["/F"].get_data()
102100
name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid4()}")
103101
attachments.append((name, data))
104102

dedoc/metadata_extractors/concrete_metadata_extractors/pdf_metadata_extractor.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -58,13 +58,13 @@ def extract(self,
5858
return result
5959

6060
def _get_pdf_info(self, path: str) -> dict:
61-
from PyPDF2 import PdfFileReader
62-
from PyPDF2.utils import PdfReadError
61+
from pypdf import PdfReader
62+
from pypdf.errors import PdfReadError
6363

6464
try:
6565
with open(path, "rb") as file:
66-
document = PdfFileReader(file)
67-
document_info = document.getDocumentInfo() if document.getDocumentInfo() is not None else {}
66+
document = PdfReader(file)
67+
document_info = document.metadata if document.metadata is not None else {}
6868
result = self.__prettify_metadata(document_info)
6969
return result
7070
except PdfReadError:

docs/source/_static/code_examples/pdf_attachment_extractor.py

+19-16
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
import os
22
from typing import List, Optional
33

4-
import PyPDF2
5-
64
from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
75
from dedoc.data_structures import AttachedFile
86
from dedoc.extensions import recognized_extensions, recognized_mimes
@@ -20,21 +18,26 @@ def can_extract(self,
2018
return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format
2119

2220
def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
21+
from pypdf import PdfReader
22+
2323
parameters = {} if parameters is None else parameters
24-
handler = open(os.path.join(file_path), "rb")
25-
reader = PyPDF2.PdfFileReader(handler)
26-
catalog = reader.trailer["/Root"]
27-
attachments = []
28-
if "/Names" not in catalog or "/EmbeddedFiles" not in catalog["/Names"]:
29-
return attachments
30-
filenames = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
31-
for filename in filenames:
32-
if isinstance(filename, str):
33-
name = filename
34-
data_index = filenames.index(filename) + 1
35-
f_dict = filenames[data_index].getObject()
36-
f_data = f_dict["/EF"]["/F"].getData()
37-
attachments.append((name, f_data))
24+
with open(os.path.join(file_path), "rb") as f:
25+
reader = PdfReader(f)
26+
catalog = reader.trailer["/Root"]
27+
28+
if "/Names" not in catalog or "/EmbeddedFiles" not in catalog["/Names"]:
29+
return []
30+
31+
attachments = []
32+
filenames = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
33+
for filename in filenames:
34+
if isinstance(filename, str):
35+
name = filename
36+
data_index = filenames.index(filename) + 1
37+
f_dict = filenames[data_index].get_object()
38+
f_data = f_dict["/EF"]["/F"].get_data()
39+
attachments.append((name, f_data))
40+
3841
attachments_dir = get_param_attachments_dir(parameters, file_path)
3942
need_content_analysis = get_param_need_content_analysis(parameters)
4043
attachments = self._content2attach_file(content=attachments, tmpdir=attachments_dir, need_content_analysis=need_content_analysis, parameters=parameters)

docs/source/_static/code_examples/pdf_reader.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
from typing import List, Optional
22

33
import tabula
4-
from PyPDF2 import PdfFileReader
54
from pdf_attachment_extractor import PdfAttachmentsExtractor
65

76
from dedoc.data_structures import CellWithMeta, LineMetadata
@@ -41,13 +40,12 @@ def __process_tables(self, path: str) -> List[Table]:
4140
return tables
4241

4342
def __process_lines(self, path: str) -> List[LineWithMeta]:
43+
from pypdf import PdfReader as PdfFileReader
4444
with open(path, "rb") as file:
4545
lines_with_meta = []
4646
pdf = PdfFileReader(file)
47-
num_pages = pdf.getNumPages()
48-
for page_id in range(num_pages):
49-
page = pdf.getPage(page_id)
50-
text = page.extractText()
47+
for page_id, page in enumerate(pdf.pages):
48+
text = page.extract_text()
5149
lines = text.split("\n")
5250
for line_id, line in enumerate(lines):
5351
metadata = LineMetadata(page_id=page_id, line_id=line_id)

docs/source/tutorials/add_new_doc_format.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ You should implement the following methods:
185185
For each line, you need to add its text, metadata, hierarchy level (if exists) and annotations (if exist).
186186
For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
187187
You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
188-
We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
188+
We use `pypdf <https://pypdf.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
189189
They must be added to ``requirements.txt`` of the project.
190190
We use class ``PdfAttachmentsExtractor`` for attachments extraction (it was mentioned before).
191191
It must be added to the reader's constructor and used in ``read`` method.

requirements.txt

+1-2
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,7 @@ pdfminer.six>=20211012,<=20231228
1717
piexif==1.1.3
1818
puremagic>=1.0,<2.0 # needs libmagic to be installed in the system
1919
pylzma==0.5.0
20-
pypdf>=3.17.0,<=4.1.0
21-
PyPDF2==1.27.0
20+
pypdf>=4.0,<6.0
2221
pytesseract==0.3.10
2322
python-docx==0.8.11
2423
python-Levenshtein==0.12.2

0 commit comments

Comments
 (0)