# article_detail_functions.py

# Modules
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re


def getDateMetadataForLink(link):
    # Article URLs embed the publication date as the first three path
    # segments after the domain: https://order-order.com/YYYY/MM/DD/slug/
    path = link.split(".com/", 1)[1]
    year, month, day = (int(part) for part in path.split("/")[:3])
    return datetime(year, month, day)
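
# A quick usage sketch (the URL below is hypothetical, for illustration only):
#
#   >>> getDateMetadataForLink("https://order-order.com/2021/05/01/example-post/")
#   datetime.datetime(2021, 5, 1, 0, 0)
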
# getLinksForArticles - Code Plan
# Iterate through pages of the 'Saturday 7-up' tag
# e.g: 'https://order-order.com/tag/saturday-seven-up/page/80/'
# For each page,
#   first check whether the title is "Page not found – Guido Fawkes"
#     if so, stop
#     else, get all the "a" tags that have the class "link--title"
# Store all links in an array, ready for parsing into the dataframe.
def getLinksForArticles():
    print("We are starting to gather Article links.")
    # Website for episodes
    TAG_URL = "https://order-order.com/tag/saturday-seven-up/page/"
    valid_url = False
    current_page = 1
    page_links = []
    while not valid_url:
        page = requests.get(TAG_URL + str(current_page) + "/", timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        # The tag archive serves a styled "Page not found" title once we run
        # past the last page; that is the loop's stop condition.
        if soup.title.string != "Page not found – Guido Fawkes":
            links = soup.find_all("a", {"class": "link--title"})
            # original_size = len(page_links)
            for link in links:
                if link['href'] not in page_links:
                    page_links.append(link['href'])
            # print("In page: " + str(current_page) + ", we found: " + str(len(page_links) - original_size) + " articles.")
            current_page += 1
        else:
            valid_url = True
    print("Overall, we found " + str(len(page_links)) + " articles")
    return page_links
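
# A hedged usage sketch: each returned href is expected to look like
# "https://order-order.com/YYYY/MM/DD/slug/", so the list can be paired
# directly with getDateMetadataForLink, e.g.:
#
#   links = getLinksForArticles()
#   dates = [getDateMetadataForLink(l) for l in links]
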
# Keywords that identify the weekly traffic-summary paragraph of an article.
# `matches` is tested with any() against the paragraph directly above the
# marker sentence; `trimmed_matches` is tested with all() in a fallback scan
# over every paragraph.
trimmed_matches = ["visitors", "pages"]
matches = ["visitors", "pages", "visits", "pageviews"]
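
# For illustration only (a hypothetical paragraph, not scraped text), the
# kind of sentence these keywords are meant to catch might read:
#   "Last week 250,000 visitors viewed 700,000 pages."
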
def getContentFromLink(link, current, total):
    # Progress message roughly every 10% of the way through (guard against a
    # zero step when total < 10, which would raise ZeroDivisionError).
    step = max(1, total // 10)
    if current % step == 0:
        print("We are on article " + str(current + 1) + " out of " + str(total + 1))
        print("Currently extracting: " + str(link))
    page = requests.get(link, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")
    current_title = soup.find("v-card-title", {"class": "red accent-4 white--text d-block"}).text.strip()
    article_details = ""
    # The traffic summary usually sits in a <p class="p1"> element...
    find_by_p1_class = soup.find("p", {"class": "p1"})
    if find_by_p1_class is not None:
        article_details = find_by_p1_class.text.replace('\xa0', ' ')
    else:
        # ...otherwise scan every paragraph for one of the known marker
        # sentences that precede or contain the summary.
        p_locator = soup.find_all("p")
        for i in range(0, len(p_locator)):
            if p_locator[i].text == "The top stories last week in order of popularity were:":
                # The summary normally sits in the paragraph just above this
                # marker sentence.
                if any(x in p_locator[i - 1].text.replace('\xa0', ' ') for x in matches):
                    article_details = p_locator[i - 1].text.replace('\xa0', ' ')
                    break
                else:
                    # Fallback: take the first paragraph containing all of
                    # the trimmed keywords.
                    for j in range(0, len(p_locator)):
                        if all(x in p_locator[j].text.replace('\xa0', ' ') for x in trimmed_matches):
                            article_details = p_locator[j].text.replace('\xa0', ' ')
                            break
            elif any(phrase in p_locator[i].text for phrase in (
                    "The top stories",
                    "most popular stories",
                    "most read and shared stories",
                    "best and most read stories",
                    "top 7 stories",
            )):
                article_details = p_locator[i].text.replace('\xa0', ' ')
                break
    post_time = soup.find("span", {"class": "posted-on blue-grey--text text--darken-4"}).text.split("@", 1)[1].strip()
    # Collapse runs of whitespace in each list item down to single spaces.
    list_links = []
    for item in soup.find_all("li"):
        list_links.append(re.sub(r'\s+', ' ', item.text.replace('\xa0', ' ')))
    return [current_title, article_details, post_time, list_links]
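
# A minimal end-to-end sketch, guarded so it only runs when the file is
# executed directly. The pandas import, the DataFrame assembly, and the
# column names are assumptions for illustration; the plan comments above only
# say the links are "ready for parsing into the dataframe".
if __name__ == "__main__":
    import pandas as pd  # assumed dependency, used only in this sketch

    links = getLinksForArticles()
    rows = []
    for idx, article_link in enumerate(links):
        # getContentFromLink's progress message treats current/total as
        # 0-indexed, so pass the last index as the total.
        title, details, post_time, list_items = getContentFromLink(
            article_link, idx, len(links) - 1)
        rows.append({
            "date": getDateMetadataForLink(article_link),
            "title": title,
            "details": details,
            "posted_at": post_time,
            "list_items": list_items,
        })
    df = pd.DataFrame(rows)
    print(df.head())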