-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathapp.py
91 lines (76 loc) · 3.6 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import datetime
import os
import re
import subprocess
import time

import pandas as pd
import streamlit as st
from PIL import Image

from app_funcs import *
# Page-level Streamlit settings (the first st.* call visible in this file).
st.set_page_config(
    layout="centered",
    initial_sidebar_state="auto",
    page_title="Scanned PDFs Checker",
    page_icon="📄",
)

# Banner image rendered at the top of the page.
main_image = Image.open('static/main_banner.png')

# Working directories: uploads, OCR output, and the generated zip archive.
input_path, output_path, zip_path = "input/", "output/", "compressed/"

# Running count of processed documents and the two result tables.
total_docs = 0
scanned_docs_df = pd.DataFrame(columns=['File Name'])
digital_docs_df = pd.DataFrame(columns=['File Name'])

# Run the project's clean_directory helper on every working directory
# (presumably empties them so each run starts fresh -- confirm in app_funcs).
for _work_dir in (input_path, output_path, zip_path):
    clean_directory(_work_dir)

# Header: banner, title, and two informational notices.
st.image(main_image, use_column_width='auto')
st.title("📑📝 Scanned PDFs checker 📄👨💻")
for _notice in (
    "✨ Checks for the number of scanned/digitally created PDFs from a corpus of PDF documents.😉",
    "☢ The app's execution time may vary depending on the size/number of the uploaded PDFs.",
):
    st.info(_notice)

# Multi-file upload widget restricted to PDFs.
uploaded_files = st.file_uploader(
    "Upload PDFs 🚀",
    accept_multiple_files=True,
    type=["pdf"],
)
# Main workflow: save the uploads, classify each PDF by running ocrmypdf on
# it, show the results, and optionally offer the OCRed files as a zip download.
with st.spinner("Working... 💫"):
    if uploaded_files:
        # Write every upload to disk so ocrmypdf can read it by path.
        for uploaded_file in uploaded_files:
            # The file name is supplied by the client; keep only its last path
            # component so it cannot point outside input_path.
            safe_name = os.path.basename(uploaded_file.name)
            with open(os.path.join(input_path, safe_name), "wb") as f:
                f.write(uploaded_file.getbuffer())

        scanned_names = []
        digital_names = []
        for file_name in os.listdir(input_path):
            print("-" * 40)
            print("Checking: ", file_name)
            start_time = time.time()
            output_file_name = "OCRed_" + file_name

            try:
                # Pass an argument list (no shell) so file names containing
                # spaces, quotes or shell metacharacters are handled safely.
                # stderr is merged into stdout because the marker checked
                # below is part of ocrmypdf's diagnostic output.
                completed = subprocess.run(
                    [
                        "ocrmypdf",
                        os.path.join(input_path, file_name),
                        os.path.join(output_path, output_file_name),
                    ],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,
                    text=True,
                    check=False,
                )
            except OSError as exc:
                # ocrmypdf could not be launched at all (e.g. not installed);
                # report it instead of silently counting every file as scanned.
                st.error(f"Could not run ocrmypdf: {exc}")
                st.stop()
            output = completed.stdout

            # ocrmypdf emits this message when a page already has text; such
            # files are counted as digital, everything else as scanned.
            if "PriorOcrFoundError: page already has text!" not in output:
                print("--- Uploaded scanned PDF ---")
                scanned_names.append(file_name)
            else:
                print("---Uploaded digital PDF ---")
                digital_names.append(file_name)

            print("Processing complete..")
            print("Time Taken: ", round(time.time() - start_time, 2), " seconds")
            total_docs += 1

        # Build the result tables in one step; DataFrame.append was removed
        # in pandas 2.0, so rows are collected in lists first.
        scanned_docs_df = pd.DataFrame(scanned_names, columns=['File Name'])
        digital_docs_df = pd.DataFrame(digital_names, columns=['File Name'])

        # Summary counters followed by the per-category file lists.
        col1, col2 = st.columns(2)
        col1.metric("# Scanned PDFs", len(scanned_docs_df))
        col2.metric("# Digital PDFs", len(digital_docs_df))
        with col1:
            st.markdown("<br>", unsafe_allow_html=True)
            st.write("List of Scanned PDFs 📝")
            st.dataframe(scanned_docs_df)
        with col2:
            st.markdown("<br>", unsafe_allow_html=True)
            st.write("List of Digital PDFs 📝")
            st.dataframe(digital_docs_df)

        # Only offer the OCR download when at least one scanned PDF was found.
        if len(scanned_docs_df) > 0:
            check = st.checkbox("Do you want me to perform OCR for the scanned PDFs? 🤔")
            if check:
                print("Generating Zip...")
                # Project helper; presumably writes OCR_PDFs.zip into zip_path
                # (confirm in app_funcs).
                compress()
                with open(zip_path + 'OCR_PDFs.zip', "rb") as file:
                    if st.download_button(
                        label="Download Zip file of OCRed PDFs 📑",
                        data=file,
                        file_name='OCR_PDFs.zip',
                        mime='application/zip',
                    ):
                        download_success()
    else:
        st.warning('⚠ Please upload your corpus of PDFs! 😯')
# Footer: author credit with a mailto link, then CSS that hides the <footer> element.
_credit_html = "<br><hr><center>Made with ❤️ by <a href='mailto:ralhanprateek@gmail.com?subject=Scanned PDFs Checker WebApp!&body=Please specify the issue you are facing with the app.'><strong>Prateek Ralhan</strong></a>✨</center><hr>"
_hide_footer_css = "<style> footer {visibility: hidden;} </style>"
for _html_snippet in (_credit_html, _hide_footer_css):
    st.markdown(_html_snippet, unsafe_allow_html=True)