-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
86 lines (64 loc) · 2.17 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
from urllib.request import urlopen, Request
import random
# Accumulators filled in by the page-scraping loop further down.
cc_students = []
cc_names = []
cc_links = []
cc_talks = []
cc_images = []

# Joke "did you survive the Titanic?" outcomes — one is drawn at random
# per student (27 students expected across the scraped cohorts).
outcomes = ["lol, no...", "Rose pushed you under..", "Jack sacraficed himself for you!", "Damn right- you swam to NY!", "Who cares!?", "A shark ate you..", "Well, you're here aren't you!?"]
cc_titanic = [random.choice(outcomes) for _ in range(27)]
print(cc_titanic)

# Cohort pages cc4 through cc7 on the Code Chrysalis site.
cc_pages = [f'https://codechrysalis.io/cc{cohort}' for cohort in range(4, 8)]
print(cc_pages)
# Matches image sources containing a literal ".jpg". The dot is escaped:
# the original pattern '.jpg' would have matched ANY character before
# "jpg" (e.g. "xjpg"), not just a file-extension dot. Compiled once,
# outside the loop.
_JPG_RE = re.compile(r'\.jpg')

# Scrape every cohort page, accumulating graduate cards, image URLs,
# student names, profile links (GitHub/LinkedIn/CV interleaved), and
# deployed-talk links into the module-level lists.
for page in cc_pages:
    # timeout guards against a hung request stalling the whole script.
    source = requests.get(page, timeout=30).text
    soup = BeautifulSoup(source, 'lxml')
    for student in soup.find_all(class_='student-graduate'):
        cc_students.append(student)
    for image in soup.find_all('img', {'src': _JPG_RE}):
        cc_images.append(image['src'])
    for name in soup.find_all(class_='student-graduate__name'):
        cc_names.append(name.text)
    for link in soup.find_all(class_="student-profile__github-link"):
        cc_links.append(link["href"])
    for talk in soup.find_all(class_="student-profile__deploy-link"):
        cc_talks.append(talk["href"])
# cc_links interleaves three links per student: GitHub, LinkedIn, CV.
# De-interleave with strided slices rather than the original hard-coded
# range(0, 81, 3), which raised IndexError when fewer than 81 links were
# scraped and silently dropped students when there were more.
git_links = cc_links[0::3]
in_links = cc_links[1::3]
cv_links = cc_links[2::3]
print(len(cc_images))

# One row per student; pandas requires every column to have equal length,
# so the scrape must yield exactly len(cc_titanic) students.
data = {'Name': cc_names,
        'Github': git_links,
        'Linkedin': in_links,
        'CV': cv_links,
        'Titanic?': cc_titanic
        }
df = pd.DataFrame(data)
# NOTE(review): machine-specific absolute output path — consider making
# this relative or configurable.
df.to_csv("/Users/admin/Desktop/py-course/titanic/cc-tables.csv")
# with open('simple.html') as html_file:
# with open('https://codechrysalis.io/cc4') as html_file:
# ccstudents = BeautifulSoup(html_file, 'lxml')
# name = ccstudents.find(class_='student-graduate__name')
# print(name.text)
# yolo = soup.find_all('h3', class_='heading')