Skip to content

Commit

Permalink
Update bad usage of if statements. Update comments. Update prints tha…
Browse files Browse the repository at this point in the history
…t didn't match format. (#160)
  • Loading branch information
CaptainStabs authored Dec 7, 2021
1 parent dd92b53 commit 5956e9e
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 25 deletions.
4 changes: 2 additions & 2 deletions common/base_scrapers/crimegraphics/crimegraphics_arrest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@

# this function is used for gathering time stats
def function_timer(stats):
    """
    Take a timer reading, but only when time stats are being gathered
    :param stats: truthy to gather timing stats, falsy to skip them
    :return: the current time.perf_counter() value when stats is truthy, otherwise None
    """
    if not stats:
        # Stats are disabled: state the "no reading" result explicitly
        return None
    return time.perf_counter()


# this function simply calculates and prints the difference between the end and start times
def time_dif(stats, string, start, end):
    """
    Print the time elapsed between two timer readings when stats are enabled
    :param stats: truthy to print the timing line, falsy to stay silent
    :param string: label printed in front of the elapsed time
    :param start: timer reading taken before the measured work
    :param end: timer reading taken after the measured work
    """
    # A single truthiness check replaces the old `!= False` comparison
    if stats:
        print(f"{string}: {end - start} seconds")


Expand Down
4 changes: 2 additions & 2 deletions common/base_scrapers/crimegraphics/crimegraphics_bulletin.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@

# this function is used for gathering time stats
def function_timer(stats):
    """
    Take a timer reading, but only when time stats are being gathered
    :param stats: truthy to gather timing stats, falsy to skip them
    :return: the current time.perf_counter() value when stats is truthy, otherwise None
    """
    if not stats:
        # Stats are disabled: state the "no reading" result explicitly
        return None
    return time.perf_counter()


# this function simply calculates and prints the difference between the end and start times
def time_dif(stats, string, start, end):
    """
    Print the time elapsed between two timer readings when stats are enabled
    :param stats: truthy to print the timing line, falsy to stay silent
    :param string: label printed in front of the elapsed time
    :param start: timer reading taken before the measured work
    :param end: timer reading taken after the measured work
    """
    # A single truthiness check replaces the old `!= False` comparison
    if stats:
        print(f"{string}: {end - start} seconds")


Expand Down
4 changes: 2 additions & 2 deletions common/base_scrapers/crimegraphics/crimegraphics_clery.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@

# this function is used for gathering time stats
def function_timer(stats):
    """
    Take a timer reading, but only when time stats are being gathered
    :param stats: truthy to gather timing stats, falsy to skip them
    :return: the current time.perf_counter() value when stats is truthy, otherwise None
    """
    if not stats:
        # Stats are disabled: state the "no reading" result explicitly
        return None
    return time.perf_counter()


# this function simply calculates and prints the difference between the end and start times
def time_dif(stats, string, start, end):
    """
    Print the time elapsed between two timer readings when stats are enabled
    :param stats: truthy to print the timing line, falsy to stay silent
    :param string: label printed in front of the elapsed time
    :param start: timer reading taken before the measured work
    :param end: timer reading taken after the measured work
    """
    # A single truthiness check replaces the old `!= False` comparison
    if stats:
        print(f"{string}: {end - start} seconds")


Expand Down
53 changes: 36 additions & 17 deletions common/utils/file_downloaders/downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def file_compare(save_dir, file_1, file_2, try_overwite=False, no_overwrite=Fals
else:
# I tried to just put the code to write it here, but it would've required too many arguments
print("File has changed")
if try_overwite == True:
if try_overwite:
os.remove(file_1)
# Renames the new file to the old_file's name (without the new_)
os.rename(file_2, file_1)
Expand Down Expand Up @@ -64,18 +64,31 @@ def check_if_exists(save_dir, file_name, add_date):
else:
return False


# These can likely get merged into a single function
def get_pdf(
save_dir, file_name, url_2, sleep_time, debug=False, try_overwite=False, no_overwrite=False, add_date=False,
):
"""
Download PDFs
:param save_dir: path where files should be saved, string
:param file_name: name of file, string
:param url_2: url of file, string
:param sleep_time: time to sleep between requests, integer
:param debug: more verbose printing, should be replaced with logging module, bool
:param try_overwite: mostly deprecated. ask before using
:param no_overwrite: replaces try_overwrite. Use with add_date for best results. Prevent overwriting of data files. (default false)
:param add_date: adds the date scraped to the filename, bool
"""
file_name = file_name.lstrip("/")
print(file_name)

if add_date is True:
if add_date:
print(" [?] add_date is True")

if not os.path.isfile("last_run.txt"):
print(" [!] last_run.txt did not exist... Is this your first time running?")
print(" [*] Creating last_run.txt and adding data...")

with open("last_run.txt", "w") as last_run:
date_name = str(date.today()).replace("-", "_")
print(date_name)
Expand All @@ -87,11 +100,12 @@ def get_pdf(

# Default run mode: simply checks that the file does not already exist.
# Don't need to check if
if os.path.exists(save_dir + file_name) == False and check_if_exists(save_dir, file_name, add_date=add_date) == False:
if not os.path.exists(save_dir + file_name) and check_if_exists(save_dir, file_name, add_date=add_date) is False:
print(" [*] File does not exist")
try:
print(" [*] Requesting file....")
pdf = urllib.request.urlopen(url_2.replace(" ", "%20"))

except urllib.error.HTTPError as exception:
print(f" [!] {exception}")
print(" [!] URL: " + str(url_2))
Expand All @@ -100,7 +114,7 @@ def get_pdf(
traceback.print_exc()
sys.exit()

if add_date == True:
if add_date:
print(" [?] add_date is True")
date_name = date.today()
file_name = file_name.strip(".pdf") + "_" + str(date_name).replace("-", "_") + ".pdf"
Expand All @@ -116,9 +130,9 @@ def get_pdf(

# If the file exists, and no_overwrite is true, then:
elif (
os.path.exists(save_dir + file_name) == True
and check_if_exists(save_dir, file_name, add_date=add_date) == False
and no_overwrite == True
os.path.exists(save_dir + file_name) is True
and check_if_exists(save_dir, file_name, add_date=add_date) is False
and no_overwrite is True
):
# Tries to get the file and set it to pdf
try:
Expand All @@ -131,7 +145,7 @@ def get_pdf(
if debug:
traceback.print_exc()
sys.exit()
print("Comparing")
print(" [*] Comparing")

# Saves the pdf while prepending with "new_"
print(" [*] Saving as new_" + file_name)
Expand All @@ -141,7 +155,8 @@ def get_pdf(
new_filename = "new_" + file_name

print(" [*] Comparing...")
if file_compare(save_dir, file_name, new_filename, no_overwrite=True) == False:

if not file_compare(save_dir, file_name, new_filename, no_overwrite=True):
print(" [?] Files are different")
date_name = date.today()
# print(date_name)
Expand All @@ -152,7 +167,7 @@ def get_pdf(
file.write(pdf.read())
file.close()
# Checks if the file exists, and that `try_overwite` is True
elif os.path.exists(save_dir + file_name) == True and try_overwite == True:
elif os.path.exists(save_dir + file_name) is True and try_overwite is True:
print(" [!!!] try_overwite is set to True, verify that you want this before continuing")
# Tries to get the file and set it to pdf
try:
Expand All @@ -166,7 +181,7 @@ def get_pdf(
sys.exit()
print("Comparing")

if add_date == True:
if add_date:
date_name = date.today()
file_name = file_name.strip(".pdf") + "_" + str(date_name).replace("-", "_") + ".pdf"
print(" [*] Date appended name: " + file_name)
Expand All @@ -186,9 +201,9 @@ def get_xls(save_dir, file_name, url_2, sleep_time, debug=False):
if ".xls" not in file_name:
# Allows saving as xls even if it's not in the file_name (saves in proper format)
file_name = file_name + ".xls"
if os.path.exists(save_dir + file_name) == False:
if not os.path.exists(save_dir + file_name):
try:
print(" [*] Requesting file...")
print(" [*] Requesting file...")
pdf = urllib.request.urlopen(url_2.replace(" ", "%20"))
except urllib.error.HTTPError as exception:
print(f" [!] {exception} ")
Expand All @@ -197,18 +212,22 @@ def get_xls(save_dir, file_name, url_2, sleep_time, debug=False):
if debug:
traceback.print_exc()
exit()

with open(save_dir + file_name, "wb") as file:
file.write(pdf.read())

file.close()
time.sleep(sleep_time)
print("Sleep")
print(" [*] Sleeping for: " + str(sleep_time))


def get_doc(save_dir, file_name, url_2, sleep_time):
    """
    Download a document as text, unless it has already been saved
    :param save_dir: path where files should be saved, string (joined to file_name by plain concatenation)
    :param file_name: name of file, string
    :param url_2: url of file, string
    :param sleep_time: time to sleep after the request, integer
    """
    target_path = save_dir + file_name

    if os.path.exists(target_path):
        # Nothing was requested, so there is no reason to throttle
        return

    # allow_redirects is an option of requests.get; str.replace rejects it with a TypeError
    document = requests.get(url_2.replace(" ", "%20"), allow_redirects=True)

    # Save to the same path the existence check looks at, otherwise the file is re-downloaded every run
    # NOTE(review): document.text is decoded text; binary formats may need document.content with "wb" - confirm
    with open(target_path, "w") as data_file:
        data_file.write(document.text)

    # Announce the pause before it starts so the message matches what is happening
    print(" [*] Sleeping for: " + str(sleep_time))
    time.sleep(sleep_time)
4 changes: 3 additions & 1 deletion common/utils/list_pdf_utils/extract_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ def extract_info(soup, configs, extract_name=False, name_in_url=True, configs_fi

url = str(link["href"])
print(url)
if extract_name == False:

if not extract_name:
# print(" [?] extract_name is False")
name = url[url.rindex("/") :]

else:
name = link.string
# print(" [?] extract_name is True")
Expand Down
2 changes: 1 addition & 1 deletion common/utils/list_pdf_utils/get_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,5 +161,5 @@ def get_files(
input_file.close()

# Used for debugging
if delete is not False:
if delete:
os.remove("url_name.txt")

0 comments on commit 5956e9e

Please sign in to comment.