Skip to content

TLDR-903 upgrade PyPDF2 to pypdf>4; fix bug with PDF attachments #515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import List, Optional, Tuple

from PyPDF2.pdf import PageObject, PdfFileReader
from pypdf import PageObject, PdfReader

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures.attached_file import AttachedFile
Expand All @@ -22,15 +22,15 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
the methods' parameters.
"""
import os
from PyPDF2.utils import PdfReadError
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
from pypdf.errors import PdfReadError

parameters = {} if parameters is None else parameters
filename = os.path.basename(file_path)

with open(file_path, "rb") as handler:
try:
reader = PdfFileReader(handler)
reader = PdfReader(handler)
except Exception as e:
self.logger.warning(f"can't handle {filename}, get {e}")
return []
Expand All @@ -55,13 +55,13 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
if "/Annots" in page.keys():
for annot in page["/Annots"]:
# Other subtypes, such as /Link, cause errors
subtype = annot.getObject().get("/Subtype")
subtype = annot.get_object().get("/Subtype")
if subtype == "/FileAttachment":
name = annot.getObject()["/FS"]["/UF"]
data = annot.getObject()["/FS"]["/EF"]["/F"].getData() # The file containing the stream data.
name = annot.get_object()["/FS"]["/UF"]
data = annot.get_object()["/FS"]["/EF"]["/F"].get_data() # The file containing the stream data.
attachments.append([name, data])
if subtype == "/Text" and annot.getObject().get("/Name") == "/Comment": # it is messages (notes) in PDF
note = annot.getObject()
if subtype == "/Text" and annot.get_object().get("/Name") == "/Comment": # it is messages (notes) in PDF
note = annot.get_object()
created_time = convert_datetime(note["/CreationDate"]) if "/CreationDate" in note else None
modified_time = convert_datetime(note["/M"]) if "/M" in note else None
user = note.get("/T")
Expand All @@ -71,17 +71,15 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
attachments.append((name, bytes(content)))
return attachments

def __get_page_level_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
cnt_page = reader.getNumPages()
def __get_page_level_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
attachments = []
for i in range(cnt_page):
page = reader.getPage(i)
for page in reader.pages:
attachments_on_page = self.__get_notes(page)
attachments.extend(attachments_on_page)

return attachments

def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes]]:
def __get_root_attachments(self, reader: PdfReader) -> List[Tuple[str, bytes]]:
"""
Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring.

Expand All @@ -96,9 +94,9 @@ def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes
for f in file_names:
if isinstance(f, str):
data_index = file_names.index(f) + 1
dict_object = file_names[data_index].getObject()
dict_object = file_names[data_index].get_object()
if "/EF" in dict_object and "/F" in dict_object["/EF"]:
data = dict_object["/EF"]["/F"].getData()
data = dict_object["/EF"]["/F"].get_data()
name = dict_object.get("/UF", f"pdf_attach_{uuid.uuid4()}")
attachments.append((name, data))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,13 +58,13 @@ def extract(self,
return result

def _get_pdf_info(self, path: str) -> dict:
from PyPDF2 import PdfFileReader
from PyPDF2.utils import PdfReadError
from pypdf import PdfReader
from pypdf.errors import PdfReadError

try:
with open(path, "rb") as file:
document = PdfFileReader(file)
document_info = document.getDocumentInfo() if document.getDocumentInfo() is not None else {}
document = PdfReader(file)
document_info = document.metadata if document.metadata is not None else {}
result = self.__prettify_metadata(document_info)
return result
except PdfReadError:
Expand Down
35 changes: 19 additions & 16 deletions docs/source/_static/code_examples/pdf_attachment_extractor.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import os
from typing import List, Optional

import PyPDF2

from dedoc.attachments_extractors.abstract_attachment_extractor import AbstractAttachmentsExtractor
from dedoc.data_structures import AttachedFile
from dedoc.extensions import recognized_extensions, recognized_mimes
Expand All @@ -20,21 +18,26 @@ def can_extract(self,
return extension in recognized_extensions.pdf_like_format or mime in recognized_mimes.pdf_like_format

def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[AttachedFile]:
from pypdf import PdfReader

parameters = {} if parameters is None else parameters
handler = open(os.path.join(file_path), "rb")
reader = PyPDF2.PdfFileReader(handler)
catalog = reader.trailer["/Root"]
attachments = []
if "/Names" not in catalog or "/EmbeddedFiles" not in catalog["/Names"]:
return attachments
filenames = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
for filename in filenames:
if isinstance(filename, str):
name = filename
data_index = filenames.index(filename) + 1
f_dict = filenames[data_index].getObject()
f_data = f_dict["/EF"]["/F"].getData()
attachments.append((name, f_data))
with open(os.path.join(file_path), "rb") as f:
reader = PdfReader(f)
catalog = reader.trailer["/Root"]

if "/Names" not in catalog or "/EmbeddedFiles" not in catalog["/Names"]:
return []

attachments = []
filenames = catalog["/Names"]["/EmbeddedFiles"]["/Names"]
for filename in filenames:
if isinstance(filename, str):
name = filename
data_index = filenames.index(filename) + 1
f_dict = filenames[data_index].get_object()
f_data = f_dict["/EF"]["/F"].get_data()
attachments.append((name, f_data))

attachments_dir = get_param_attachments_dir(parameters, file_path)
need_content_analysis = get_param_need_content_analysis(parameters)
attachments = self._content2attach_file(content=attachments, tmpdir=attachments_dir, need_content_analysis=need_content_analysis, parameters=parameters)
Expand Down
8 changes: 3 additions & 5 deletions docs/source/_static/code_examples/pdf_reader.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import List, Optional

import tabula
from PyPDF2 import PdfFileReader
from pdf_attachment_extractor import PdfAttachmentsExtractor

from dedoc.data_structures import CellWithMeta, LineMetadata
Expand Down Expand Up @@ -41,13 +40,12 @@ def __process_tables(self, path: str) -> List[Table]:
return tables

def __process_lines(self, path: str) -> List[LineWithMeta]:
from pypdf import PdfReader as PdfFileReader
with open(path, "rb") as file:
lines_with_meta = []
pdf = PdfFileReader(file)
num_pages = pdf.getNumPages()
for page_id in range(num_pages):
page = pdf.getPage(page_id)
text = page.extractText()
for page_id, page in enumerate(pdf.pages):
text = page.extract_text()
lines = text.split("\n")
for line_id, line in enumerate(lines):
metadata = LineMetadata(page_id=page_id, line_id=line_id)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/tutorials/add_new_doc_format.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ You should implement the following methods:
For each line, you need to add its text, metadata, hierarchy level (if it exists) and annotations (if they exist).
For tables, you need to add a list of rows (each row is a list of table cells) and metadata.
You can use :ref:`dedoc_data_structures` to learn more about all the described structures.
We use `PyPDF2 <https://pypdf2.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
We use `pypdf <https://pypdf.readthedocs.io>`_ to extract the text and `tabula <https://tabula-py.readthedocs.io>`_ to extract tables.
They must be added to ``requirements.txt`` of the project.
We use the ``PdfAttachmentsExtractor`` class for attachment extraction (it was mentioned before).
It must be added to the reader's constructor and used in the ``read`` method.
Expand Down
3 changes: 1 addition & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,7 @@ pdfminer.six>=20211012,<=20231228
piexif==1.1.3
puremagic>=1.0,<2.0 # needs libmagic to be installed in the system
pylzma==0.5.0
pypdf>=3.17.0,<=4.1.0
PyPDF2==1.27.0
pypdf>=4.0,<6.0
pytesseract==0.3.10
python-docx==0.8.11
python-Levenshtein==0.12.2
Expand Down