Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update common utils #160

Merged
merged 1 commit into from
Dec 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions common/base_scrapers/crimegraphics/crimegraphics_arrest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@

# this function is used for gathering time stats
def function_timer(stats):
if stats != False:
if stats:
return time.perf_counter()


# this function simply calculates and prints the difference between the end and start times
def time_dif(stats, string, start, end):
if stats != False:
if stats:
print(f"{string}: {end - start} seconds")


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@

# this function is used for gathering time stats
def function_timer(stats):
if stats != False:
if stats:
return time.perf_counter()


# this function simply calculates and prints the difference between the end and start times
def time_dif(stats, string, start, end):
if stats != False:
if stats:
print(f"{string}: {end - start} seconds")


Expand Down
4 changes: 2 additions & 2 deletions common/base_scrapers/crimegraphics/crimegraphics_clery.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@

# this function is used for gathering time stats
def function_timer(stats):
if stats != False:
if stats:
return time.perf_counter()


# this function simply calculates and prints the difference between the end and start times
def time_dif(stats, string, start, end):
if stats != False:
if stats:
print(f"{string}: {end - start} seconds")


Expand Down
53 changes: 36 additions & 17 deletions common/utils/file_downloaders/downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def file_compare(save_dir, file_1, file_2, try_overwite=False, no_overwrite=Fals
else:
# I tried to just put the code to write it here, but it would've required too many arguments
print("File has changed")
if try_overwite == True:
if try_overwite:
os.remove(file_1)
# Renames the new file to the old_file's name (without the new_)
os.rename(file_2, file_1)
Expand Down Expand Up @@ -64,18 +64,31 @@ def check_if_exists(save_dir, file_name, add_date):
else:
return False


# These can likely get merged into a single function
def get_pdf(
save_dir, file_name, url_2, sleep_time, debug=False, try_overwite=False, no_overwrite=False, add_date=False,
):
"""
Download PDFs
:param save_dir: path where files should be saved, string
:param file_name: name of file, string
:param url_2: url of file, string
:param sleep_time: time to sleep between requests, integer
:param debug: more verbose printing, should be replaced with logging module, bool
:param try_overwite: mostly deprecated. ask before using
:param no_overwrite: replaces try_overwite. Use with add_date for best results. Prevents overwriting of data files, bool (default False)
:param add_date: adds the date scraped to the filename, bool
"""
file_name = file_name.lstrip("/")
print(file_name)

if add_date is True:
if add_date:
print(" [?] add_date is True")

if not os.path.isfile("last_run.txt"):
print(" [!] last_run.txt did not exist... Is this your first time running?")
print(" [*] Creating last_run.txt and adding data...")

with open("last_run.txt", "w") as last_run:
date_name = str(date.today()).replace("-", "_")
print(date_name)
Expand All @@ -87,11 +100,12 @@ def get_pdf(

# Default run mode, simply checks that the file does not already exist.
# Don't need to check if
if os.path.exists(save_dir + file_name) == False and check_if_exists(save_dir, file_name, add_date=add_date) == False:
if not os.path.exists(save_dir + file_name) and check_if_exists(save_dir, file_name, add_date=add_date) is False:
print(" [*] File does not exist")
try:
print(" [*] Requesting file....")
pdf = urllib.request.urlopen(url_2.replace(" ", "%20"))

except urllib.error.HTTPError as exception:
print(f" [!] {exception}")
print(" [!] URL: " + str(url_2))
Expand All @@ -100,7 +114,7 @@ def get_pdf(
traceback.print_exc()
sys.exit()

if add_date == True:
if add_date:
print(" [?] add_date is True")
date_name = date.today()
file_name = file_name.strip(".pdf") + "_" + str(date_name).replace("-", "_") + ".pdf"
Expand All @@ -116,9 +130,9 @@ def get_pdf(

# If the file exists, and no_overwrite is true, then:
elif (
os.path.exists(save_dir + file_name) == True
and check_if_exists(save_dir, file_name, add_date=add_date) == False
and no_overwrite == True
os.path.exists(save_dir + file_name) is True
and check_if_exists(save_dir, file_name, add_date=add_date) is False
and no_overwrite is True
):
# Tries to get the file and set it to pdf
try:
Expand All @@ -131,7 +145,7 @@ def get_pdf(
if debug:
traceback.print_exc()
sys.exit()
print("Comparing")
print(" [*] Comparing")

# Saves the pdf while prepending with "new_"
print(" [*] Saving as new_" + file_name)
Expand All @@ -141,7 +155,8 @@ def get_pdf(
new_filename = "new_" + file_name

print(" [*] Comparing...")
if file_compare(save_dir, file_name, new_filename, no_overwrite=True) == False:

if not file_compare(save_dir, file_name, new_filename, no_overwrite=True):
print(" [?] Files are different")
date_name = date.today()
# print(date_name)
Expand All @@ -152,7 +167,7 @@ def get_pdf(
file.write(pdf.read())
file.close()
# Checks if the file exists, and that `try_overwite` is True
elif os.path.exists(save_dir + file_name) == True and try_overwite == True:
elif os.path.exists(save_dir + file_name) is True and try_overwite is True:
print(" [!!!] try_overwite is set to True, verify that you want this before continuing")
# Tries to get the file and set it to pdf
try:
Expand All @@ -166,7 +181,7 @@ def get_pdf(
sys.exit()
print("Comparing")

if add_date == True:
if add_date:
date_name = date.today()
file_name = file_name.strip(".pdf") + "_" + str(date_name).replace("-", "_") + ".pdf"
print(" [*] Date appended name: " + file_name)
Expand All @@ -186,9 +201,9 @@ def get_xls(save_dir, file_name, url_2, sleep_time, debug=False):
if ".xls" not in file_name:
# Allows saving as xls even if it's not in the file_name (saves in proper format)
file_name = file_name + ".xls"
if os.path.exists(save_dir + file_name) == False:
if not os.path.exists(save_dir + file_name):
try:
print(" [*] Requesting file...")
print(" [*] Requesting file...")
pdf = urllib.request.urlopen(url_2.replace(" ", "%20"))
except urllib.error.HTTPError as exception:
print(f" [!] {exception} ")
Expand All @@ -197,18 +212,22 @@ def get_xls(save_dir, file_name, url_2, sleep_time, debug=False):
if debug:
traceback.print_exc()
exit()

with open(save_dir + file_name, "wb") as file:
file.write(pdf.read())

file.close()
time.sleep(sleep_time)
print("Sleep")
print(" [*] Sleeping for: " + str(sleep_time))


def get_doc(save_dir, file_name, url_2, sleep_time):
if os.path.exists(save_dir + file_name) == False:
if not os.path.exists(save_dir + file_name):
document = requests.get(url_2.replace(" ", "%20", allow_redirects=True))

with open(file_name, "w") as data_file:
data_file.write(document.text) # Writes using requests text function thing

data_file.close()
time.sleep(sleep_time)
print("Sleep")
print(" [*] Sleeping for: " + str(sleep_time))
4 changes: 3 additions & 1 deletion common/utils/list_pdf_utils/extract_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ def extract_info(soup, configs, extract_name=False, name_in_url=True, configs_fi

url = str(link["href"])
print(url)
if extract_name == False:

if not extract_name:
# print(" [?] extract_name is False")
name = url[url.rindex("/") :]

else:
name = link.string
# print(" [?] extract_name is True")
Expand Down
2 changes: 1 addition & 1 deletion common/utils/list_pdf_utils/get_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,5 +161,5 @@ def get_files(
input_file.close()

# Used for debugging
if delete is not False:
if delete:
os.remove("url_name.txt")