Skip to content

Commit e52e9b5

Browse files
authored
Create leadersleague_scraper.py
1 parent 5bd3405 commit e52e9b5

File tree

1 file changed

+391
-0
lines changed

1 file changed

+391
-0
lines changed

leadersleague_scraper.py

+391
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,391 @@
1+
import requests
2+
import csv
3+
import os
4+
5+
6+
# Lookup table from two-letter region codes to the lowercase country names
# that listCompanies title-cases for the "Region" CSV column. A value of None
# makes listCompanies title-case the raw code instead.
countries = {
    "BR": "bresil",
    "PE": "perou",
    "CO": "colombia",
    "IT": "italie",
    "ES": "espagne",
    "DE": "allemagne",
    "CH": "suisse",
    "CL": "chile",
    "BE": "belgique",
    "LU": "luxembourg",
    "PT": "portugal",
    "MX": "mexico",
    "FR": "france",
    "EC": "ecuador",
    "AR": "argentina",
    "AC": None,
    "AF": None,
    "CA": "canada",
    "BO": "bolivia",
    "UY": "uruguay",
    "GB": "royaume-uni",
    "US": "etats-unis",
    "CN": "chine",
}
31+
32+
33+
def saveData(dataset):
    """Append one lead to ``leads.csv`` in the current working directory.

    Args:
        dataset: Indexable holding ten values, read positionally (0..9) in
            the same order as the CSV columns listed below.

    The header row is written first whenever the file is empty at the time
    of the call.
    """
    columns = [
        "Email", "First Name", "Last Name", "Full Name", "Job Position",
        "Company Name", "Company URL", "Region", "Category", "LeadersLeague Link"
    ]
    with open('leads.csv', mode='a+', encoding='utf-8-sig', newline='') as handle:
        out = csv.DictWriter(
            handle,
            fieldnames=columns,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_MINIMAL,
        )
        file_is_empty = os.stat('leads.csv').st_size == 0
        if file_is_empty:
            out.writeheader()
        # Built after the header step so an index error surfaces at the same
        # point as a literal dict of dataset[0]..dataset[9] would.
        record = {column: dataset[position] for position, column in enumerate(columns)}
        out.writerow(record)
54+
55+
56+
def getCategory(subtopic):
    """Return the French (``fr_FR``) title of a subtopic.

    Fetches ``https://api.leadersleague.com/subtopic/{subtopic}`` and scans the
    ``contents`` list of the JSON response for the entry whose ``lang`` is
    ``fr_FR``.

    Args:
        subtopic: Identifier interpolated into the API URL.

    Returns:
        The ``title`` of the first ``fr_FR`` entry, or ``""`` when the request
        fails, the body is not a JSON object, or no such entry exists.
    """
    link = f'https://api.leadersleague.com/subtopic/{subtopic}'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort: network and JSON-decoding errors give an empty result.
        # Unlike a bare ``except:``, Ctrl-C / SystemExit still propagate.
        print("Failed to open {}".format(link))
        return ""
    if not isinstance(resp, dict):
        # e.g. a JSON list or ``null``: nothing to look up.
        return ""
    for content in resp.get('contents') or []:
        if isinstance(content, dict) and content.get('lang', '') == 'fr_FR':
            return content.get('title', '')
    return ""
85+
86+
87+
def getState(state):
    """Return the French (``fr_FR``) name of a state.

    Fetches ``https://api.leadersleague.com/state/{state}`` and scans the
    ``contents`` list of the JSON response for the entry whose ``lang`` is
    ``fr_FR``.

    Args:
        state: Identifier interpolated into the API URL.

    Returns:
        The ``name`` of the first ``fr_FR`` entry, or ``""`` when the request
        fails, the body is not a JSON object, or no such entry exists.
    """
    link = f'https://api.leadersleague.com/state/{state}'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort: network and JSON-decoding errors give an empty result.
        # Unlike a bare ``except:``, Ctrl-C / SystemExit still propagate.
        print("Failed to open {}".format(link))
        return ""
    if not isinstance(resp, dict):
        # e.g. a JSON list or ``null``: nothing to look up.
        return ""
    for content in resp.get('contents') or []:
        if isinstance(content, dict) and content.get('lang', '') == 'fr_FR':
            return content.get('name', '')
    return ""
116+
117+
118+
def getProfession(profession):
    """Return the French (``fr_FR``) title of a profession.

    Fetches ``https://api.leadersleague.com/profession/{profession}`` and
    scans the ``contents`` list of the JSON response for the entry whose
    ``lang`` is ``fr_FR``.

    Args:
        profession: Identifier interpolated into the API URL.

    Returns:
        The ``title`` of the first ``fr_FR`` entry, or ``""`` when the request
        fails, the body is not a JSON object, or no such entry exists.
    """
    link = f'https://api.leadersleague.com/profession/{profession}'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort: network and JSON-decoding errors give an empty result.
        # Unlike a bare ``except:``, Ctrl-C / SystemExit still propagate.
        print("Failed to open {}".format(link))
        return ""
    if not isinstance(resp, dict):
        # e.g. a JSON list or ``null``: nothing to look up.
        return ""
    for content in resp.get('contents') or []:
        if isinstance(content, dict) and content.get('lang', '') == 'fr_FR':
            return content.get('title', '')
    return ""
147+
148+
149+
def getPersonSingleDetail(person_slug, company_name, company_url, parent_link, region, category, job_only=False):
    """Fetch one person's record and either return their job or save a CSV row.

    Requests ``https://api.leadersleague.com/people/{person_slug}/aggregate``.

    Args:
        person_slug: Identifier interpolated into the API URL.
        company_name: Written to the "Company Name" column.
        company_url: Written to the "Company URL" column.
        parent_link: Written to the "LeadersLeague Link" column.
        region: Written to the "Region" column.
        category: Written to the "Category" column.
        job_only: When true, return the job position and write nothing.

    Returns:
        With ``job_only=True``: the value of ``jobs.job`` from the response
        (``''`` when ``jobs`` is missing or not an object). Otherwise ``None``
        after the row has been passed to ``saveData``. ``None`` is also
        returned when the request fails or the body is not a JSON object.
    """
    link = f'https://api.leadersleague.com/people/{person_slug}/aggregate'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        people = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort, but Ctrl-C / SystemExit are no longer swallowed.
        print("Failed to open {}".format(link))
        return
    if not isinstance(people, dict):
        print("Failed to open {}".format(link))
        return

    # ``or ''`` also covers keys that are present but null in the JSON, which
    # previously raised TypeError while building the full name.
    first_name = people.get('firstname') or ''
    last_name = people.get('lastname') or ''
    full_name = f"{first_name} {last_name}"

    jobs = people.get('jobs')
    job_position = jobs.get('job') if isinstance(jobs, dict) else ''
    if job_only:
        return job_position

    email = people.get('email', '')
    # The profile link is only printed, so a missing slug must not cost the row.
    profile_link = "https://www.leadersleague.com/fr/peoples/" + \
        (people.get('slug') or '')
    print("Person Full Name: {}".format(full_name))
    print("Person Profile Link: {}".format(profile_link))
    print("Person Job Position: {}".format(job_position))
    print("Person Email: {}".format(email))
    print("Company Name: {}".format(company_name))
    print("Company URL: {}".format(company_url))
    print("Region: {}".format(region))
    dataset = [email, first_name, last_name, full_name,
               job_position, company_name, company_url, region, category, parent_link]
    saveData(dataset)
195+
196+
197+
def getPersonList(company_uuid, company_slug, company_name, company_url, parent_link, region, category):
    """Save one CSV row per person listed on a company page.

    Requests the company-page "aggregate" endpoint (first page, 500 per page,
    with the ``country`` and ``expertise`` query values hard-coded below) and,
    for every entry under ``peoples``, looks up the job position through
    ``getPersonSingleDetail(..., job_only=True)`` before calling ``saveData``.

    Args:
        company_uuid: Sent as the ``companyUuid`` query parameter.
        company_slug: Interpolated into the URL path.
        company_name: Written to the "Company Name" column.
        company_url: Written to the "Company URL" column.
        parent_link: Written to the "LeadersLeague Link" column.
        region: Written to the "Region" column.
        category: Written to the "Category" column.

    Returns:
        ``[]`` when the request fails or the body is not a JSON object,
        otherwise ``None``.
    """
    # Built from adjacent literals: a line break inside an f-string
    # replacement field only parses on Python 3.12+.
    link = (
        f"https://api.leadersleague.com/company-page/{company_slug}/aggregate"
        f"?page=1&perpage=500&companyUuid={company_uuid}"
        "&country=france-1&expertise=gestion-de-patrimoine"
    )
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort, but Ctrl-C / SystemExit are no longer swallowed.
        print("Failed to open {}".format(link))
        return []
    if not isinstance(resp, dict):
        print("Failed to open {}".format(link))
        return []

    for people in resp.get('peoples') or []:
        # ``or ''`` also covers keys that are present but null in the JSON.
        first_name = people.get('firstName') or ''
        last_name = people.get('lastName') or ''
        full_name = f"{first_name} {last_name}"
        slug = people.get('slug')
        if slug:
            job_position = getPersonSingleDetail(
                slug, company_name, company_url, parent_link, region, category, job_only=True)
        else:
            # No slug means no detail URL; keep the row with an empty job
            # rather than aborting the rest of the list.
            job_position = ''
        email = people.get('email', '')
        profile_link = "https://www.leadersleague.com/fr/peoples/" + (slug or '')
        print("Person Full Name: {}".format(full_name))
        print("Person Profile Link: {}".format(profile_link))
        print("Person Job Position: {}".format(job_position))
        print("Person Email: {}".format(email))
        print("Company Name: {}".format(company_name))
        print("Company URL: {}".format(company_url))
        print("Region: {}".format(region))
        print("Company UUID: {}".format(company_uuid))
        dataset = [email, first_name, last_name, full_name,
                   job_position, company_name, company_url, region, category, parent_link]
        saveData(dataset)
242+
243+
244+
def getCompanyWebsite(company_uuid):
    """Return the ``websiteUrl`` field of a firm's aggregate record.

    NOTE(review): this file defines ``getCompanyWebsite`` a second time
    further down; that later definition replaces this one when the module is
    loaded, so this copy is effectively dead code.

    Args:
        company_uuid: Identifier interpolated into
            ``https://api.leadersleague.com/firm/{company_uuid}/aggregate``.

    Returns:
        The ``websiteUrl`` value from the JSON response, or ``None`` when the
        request fails, the body is not a JSON object, or the key is absent.
    """
    link = f'https://api.leadersleague.com/firm/{company_uuid}/aggregate'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort, but Ctrl-C / SystemExit are no longer swallowed.
        print("Failed to open {}".format(link))
        return None
    if not isinstance(resp, dict):
        return None
    return resp.get('websiteUrl')
269+
270+
271+
def listCompanies(link, parent_link):
    """Process one ranking: resolve region and category, then save its people.

    Fetches ``link``, derives a region (state name if available, otherwise
    the country) and a category (subtopic title, otherwise profession title),
    then walks ``companies`` in (level, sublevel, position) order. Companies
    flagged ``isCompanyPage`` are handled by ``getPersonList``; for the others
    each entry in the company's ``peoples`` is handled by
    ``getPersonSingleDetail``.

    Args:
        link: URL of the ranking API endpoint to fetch.
        parent_link: Passed through to the per-person functions, which store
            it in the "LeadersLeague Link" CSV column.

    Returns:
        None.
    """
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort, but Ctrl-C / SystemExit are no longer swallowed.
        print("Failed to open {}".format(link))
        return
    if not isinstance(resp, dict):
        print("Failed to open {}".format(link))
        return

    # Region: prefer the resolved state name, otherwise fall back to country.
    state = resp.get('states')
    if state:
        state = getState(state)
    if state:
        region = state
    else:
        region = resp.get('countries')
        try:
            region_matched = countries.get(region, region)
            # Codes mapped to None are title-cased as they came from the API.
            region = (region if region_matched is None else region_matched).title()
        except (TypeError, AttributeError):
            # Unhashable or non-string value: keep the raw API value.
            pass
    print("Region: {}".format(region))

    # Category: subtopic title first, profession title as the fallback.
    category = resp.get('subtopics', "")
    profession = resp.get('professions')
    if category:
        category = getCategory(category)
    if not category:
        # Skip the lookup when there is no profession to look up.
        category = getProfession(profession) if profession else ""
    print("Category: {}".format(category))

    companies = resp.get('companies') or []
    try:
        companies = sorted(companies, key=lambda x: (
            x['level'], x['sublevel'], x['position']))
    except (KeyError, TypeError):
        # A missing or non-comparable rank field should not discard the whole
        # ranking; fall back to the order the API returned.
        print("Could not sort companies by rank, keeping API order")

    for company in companies:
        company_name = company.get('name')
        print("Company: {}".format(company_name))
        company_uuid = company.get('uuid')
        company_slug = company.get('slug')
        company_url = getCompanyWebsite(company_uuid)
        if not company.get('isCompanyPage'):
            for people in company.get('peoples') or []:
                people_uuid = people.get('uuid')
                if people_uuid is None:
                    continue
                try:
                    getPersonSingleDetail(
                        people_uuid, company_name, company_url, parent_link, region, category)
                except Exception:
                    # One bad record must not stop the rest; Ctrl-C still does.
                    print("Error getting person data for person uuid {}".format(
                        people_uuid))
        else:
            try:
                getPersonList(company_uuid, company_slug, company_name,
                              company_url, parent_link, region, category)
            except Exception:
                print("Error getting person list for company {}".format(company_name))
344+
345+
346+
def getCompanyWebsite(company_slug):
    """Return the ``websiteUrl`` field of a firm's aggregate record.

    This definition overrides the earlier ``getCompanyWebsite`` in this file.
    The parameter is named ``company_slug``, but ``listCompanies`` calls it
    with the company's ``uuid``; the value is only interpolated into the URL.

    Args:
        company_slug: Identifier interpolated into
            ``https://api.leadersleague.com/firm/{company_slug}/aggregate``.

    Returns:
        The ``websiteUrl`` value from the JSON response (``None`` if the key
        is absent), or ``''`` when the request fails or the body is not a
        JSON object.
    """
    link = f'https://api.leadersleague.com/firm/{company_slug}/aggregate'
    headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br, zstd',
        'accept-language': 'en-US,en;q=0.9,bn;q=0.8',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'priority': 'u=0, i',
        'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests.get(link, headers=headers, timeout=30).json()
    except Exception:
        # Best effort, but Ctrl-C / SystemExit are no longer swallowed.
        print("Failed to open {}".format(link))
        return ''
    if not isinstance(resp, dict):
        return ''
    return resp.get('websiteUrl')
371+
372+
373+
def buildApiLink(parent_link):
    """Return the ranking API URL for a ranking page link.

    The last non-empty path segment of ``parent_link`` is used as the content
    slug, so links with and without a trailing slash map to the same URL.

    Args:
        parent_link: Page link as read from ``links.txt``.

    Returns:
        The ``.../content/{slug}/aggregate`` API URL.
    """
    # Stripping the trailing slash first avoids the empty final segment that
    # previously led to a list slice being formatted into the URL.
    slug = parent_link.rstrip('/').split('/')[-1]
    return f"https://api.leadersleague.com/ranking/n9F4oEK34Rc-0/content/{slug}/aggregate"


if __name__ == "__main__":
    # Read every link up front; the context manager closes the file promptly.
    with open('links.txt', mode='r', encoding='utf-8') as links_file:
        links = links_file.read().split('\n')
    for parent_link in links:
        # Ignore blank lines and stray surrounding whitespace.
        parent_link = parent_link.strip()
        if parent_link == "":
            continue
        print("Checking link {}".format(parent_link))
        api_link = buildApiLink(parent_link)
        try:
            listCompanies(api_link, parent_link)
        except Exception:
            # Keep going with the next link; Ctrl-C still stops the run.
            print("An exception occured while processing company listing, log is below")
            import traceback
            traceback.print_exc()

0 commit comments

Comments
 (0)