1
1
from typing import List , Optional , Tuple
2
2
3
- from PyPDF2 . pdf import PageObject , PdfFileReader
3
+ from pypdf import PageObject , PdfReader
4
4
5
5
from dedoc .attachments_extractors .abstract_attachment_extractor import AbstractAttachmentsExtractor
6
6
from dedoc .data_structures .attached_file import AttachedFile
@@ -22,15 +22,15 @@ def extract(self, file_path: str, parameters: Optional[dict] = None) -> List[Att
22
22
the methods' parameters.
23
23
"""
24
24
import os
25
- from PyPDF2 .utils import PdfReadError
26
25
from dedoc .utils .parameter_utils import get_param_attachments_dir , get_param_need_content_analysis
26
+ from pypdf .errors import PdfReadError
27
27
28
28
parameters = {} if parameters is None else parameters
29
29
filename = os .path .basename (file_path )
30
30
31
31
with open (file_path , "rb" ) as handler :
32
32
try :
33
- reader = PdfFileReader (handler )
33
+ reader = PdfReader (handler )
34
34
except Exception as e :
35
35
self .logger .warning (f"can't handle { filename } , get { e } " )
36
36
return []
@@ -55,13 +55,13 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
55
55
if "/Annots" in page .keys ():
56
56
for annot in page ["/Annots" ]:
57
57
# Other subtypes, such as /Link, cause errors
58
- subtype = annot .getObject ().get ("/Subtype" )
58
+ subtype = annot .get_object ().get ("/Subtype" )
59
59
if subtype == "/FileAttachment" :
60
- name = annot .getObject ()["/FS" ]["/UF" ]
61
- data = annot .getObject ()["/FS" ]["/EF" ]["/F" ].getData () # The file containing the stream data.
60
+ name = annot .get_object ()["/FS" ]["/UF" ]
61
+ data = annot .get_object ()["/FS" ]["/EF" ]["/F" ].get_data () # The file containing the stream data.
62
62
attachments .append ([name , data ])
63
- if subtype == "/Text" and annot .getObject ().get ("/Name" ) == "/Comment" : # it is messages (notes) in PDF
64
- note = annot .getObject ()
63
+ if subtype == "/Text" and annot .get_object ().get ("/Name" ) == "/Comment" : # it is messages (notes) in PDF
64
+ note = annot .get_object ()
65
65
created_time = convert_datetime (note ["/CreationDate" ]) if "/CreationDate" in note else None
66
66
modified_time = convert_datetime (note ["/M" ]) if "/M" in note else None
67
67
user = note .get ("/T" )
@@ -71,17 +71,15 @@ def __get_notes(self, page: PageObject) -> List[Tuple[str, bytes]]:
71
71
attachments .append ((name , bytes (content )))
72
72
return attachments
73
73
74
- def __get_page_level_attachments (self , reader : PdfFileReader ) -> List [Tuple [str , bytes ]]:
75
- cnt_page = reader .getNumPages ()
74
+ def __get_page_level_attachments (self , reader : PdfReader ) -> List [Tuple [str , bytes ]]:
76
75
attachments = []
77
- for i in range (cnt_page ):
78
- page = reader .getPage (i )
76
+ for page in reader .pages :
79
77
attachments_on_page = self .__get_notes (page )
80
78
attachments .extend (attachments_on_page )
81
79
82
80
return attachments
83
81
84
- def __get_root_attachments (self , reader : PdfFileReader ) -> List [Tuple [str , bytes ]]:
82
+ def __get_root_attachments (self , reader : PdfReader ) -> List [Tuple [str , bytes ]]:
85
83
"""
86
84
Retrieves the file attachments of the PDF as a dictionary of file names and the file data as a bytestring.
87
85
@@ -96,9 +94,9 @@ def __get_root_attachments(self, reader: PdfFileReader) -> List[Tuple[str, bytes
96
94
for f in file_names :
97
95
if isinstance (f , str ):
98
96
data_index = file_names .index (f ) + 1
99
- dict_object = file_names [data_index ].getObject ()
97
+ dict_object = file_names [data_index ].get_object ()
100
98
if "/EF" in dict_object and "/F" in dict_object ["/EF" ]:
101
- data = dict_object ["/EF" ]["/F" ].getData ()
99
+ data = dict_object ["/EF" ]["/F" ].get_data ()
102
100
name = dict_object .get ("/UF" , f"pdf_attach_{ uuid .uuid4 ()} " )
103
101
attachments .append ((name , data ))
104
102
0 commit comments