scrap_organization_projects.py
import requests
import csv
from bs4 import BeautifulSoup

# Read the most recent year's GSoC organizations URL from the archive list.
year, organizations_url, organizations_html = "", "", ""
with open('GSoC_archives.csv', 'r') as csv_read_file:
    csv_reader = csv.reader(csv_read_file)
    next(csv_reader)  # Skip the header
    row = next(csv_reader)  # First data row: last year's GSoC organizations URL
    year, organizations_url, organizations_html = row[0], row[1], requests.get(row[1]).text

# Construct the GSoC archive base URL: protocol + subdomain + domain
organizations_url = organizations_url.split('/')
gsoc_archive_url = organizations_url[0] + "//" + organizations_url[2]

# Collect each organization's name and archive page URL from the year's organization list.
soup = BeautifulSoup(organizations_html, 'html.parser')
names = soup.find_all(class_="organization-card__link")
organizations = {}
for name in names:
    organizations[name.h4.text] = gsoc_archive_url + name["href"]
# organizations = {organization_name: organization_url}

# Visit each organization's page and count its project cards.
for org_name, org_url in organizations.items():
    print("parsing " + org_name)
    soup = BeautifulSoup(requests.get(org_url).text, 'html.parser')
    names = soup.find_all(class_="archive-project-card")
    organizations[org_name] = [org_url, len(names)]
# organizations = {organization_name: [organization_url, organization_project_count]}

# Write the results, sorted by project count (descending), then by organization name.
with open('organization_projects.csv', 'w', newline='') as csv_write_file:
    csv_writer = csv.writer(csv_write_file)
    csv_writer.writerow(['Organization', 'URL', 'Number of Projects in ' + year])
    # csv_writer.writerow(['Organization', 'Number of Projects in ' + year])
    for org_name, [org_url, project_count] in sorted(organizations.items(), key=lambda x: (-x[1][1], x[0])):
        csv_writer.writerow([org_name, org_url, project_count])
        # csv_writer.writerow(["[" + org_name + "](" + org_url + ")", project_count])
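# Sketch of the expected GSoC_archives.csv input, inferred only from how the
# columns are read above (row[0] = year, row[1] = organizations page URL).
# The header text and URL below are hypothetical illustrations, not taken
# from the actual archive file:
#
#   Year,Organizations URL
#   2019,https://<gsoc-archive-host>/archive/2019/organizations/
#
# The resulting organization_projects.csv then contains one row per
# organization, ordered by project count (highest first) and then by name.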