-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathseniorly_scraper.py
121 lines (84 loc) · 3.28 KB
/
seniorly_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import requests
import bs4
import re
import json
import pandas as pd
def listToString(s):
    """Join the strings in *s* into one string separated by ``', '``.

    Args:
        s: Iterable of strings. A plain string is treated as an iterable of
            its characters.

    Returns:
        The items joined with ``', '``; an empty string when *s* is empty.

    Raises:
        TypeError: If any item is not a string.
    """
    # str.join builds the result in a single pass, replacing the repeated
    # `+=` concatenation followed by trimming the trailing separator.
    return ', '.join(s)
def pageRequests(url, timeout=30):
    """Download *url* and parse the response body with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A ``bs4.BeautifulSoup`` tree built from the response text using the
        ``lxml`` parser.

    Raises:
        requests.exceptions.RequestException: On connection errors or when
            the timeout expires.

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other response.
    """
    response = requests.get(url, timeout=timeout)
    return bs4.BeautifulSoup(response.text, 'lxml')
def getPageTitle(soup):
    """Return the page title text with surrounding whitespace removed.

    Reads ``soup.title.string``. An AttributeError propagates when either
    ``soup.title`` or its ``string`` attribute is None.
    """
    title_tag = soup.title
    title_text = title_tag.string
    return title_text.strip()
def getMetaDescription(soup):
    """Return the ``content`` of every ``<meta name="description">`` tag.

    All ``<meta>`` tags are scanned; those whose ``name`` attribute equals
    ``'description'`` contribute their ``content`` attribute. The collected
    values are combined with ``listToString``. A matching tag without a
    ``content`` attribute raises KeyError.
    """
    descriptions = []
    for tag in soup.find_all('meta'):
        attributes = tag.attrs
        # Skip tags that have no name attribute or a different name.
        if 'name' not in attributes or attributes['name'] != 'description':
            continue
        descriptions.append(attributes['content'])
    return listToString(descriptions)
def getCommunityName(soup):
    """Return the stripped text of the element holding the community name.

    The element is located by the site's generated H1 heading class string.
    An AttributeError propagates when ``soup.find`` returns None.
    """
    heading_class = 'Root__H1-sc-1lnoi1-0 iCGCDp Root-sc-1lnoi1-4 eaYBkV Heading-p8la3x-0 irVoEB'
    heading = soup.find(class_=heading_class)
    heading_text = heading.get_text()
    return heading_text.strip()
def getCommunityStreetAddress(soup):
    """Return the stripped text of the address heading element.

    The whole heading text is returned as-is; no part of it is split off.
    The element is located by the site's generated H2 heading class string.
    An AttributeError propagates when ``soup.find`` returns None.
    """
    address_class = 'Root__H2-sc-1lnoi1-1 eaHtqb Root-sc-1lnoi1-4 jBwoEe Heading-p8la3x-0 irVoEB'
    address_heading = soup.find(class_=address_class)
    address_text = address_heading.get_text()
    return address_text.strip()
def getCommunityZipCode(soup):
    """Return the last word of the address heading, expected to be the ZIP code.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The final whitespace-separated token of the address text after every
        non-word character has been replaced by a space.

    Raises:
        AttributeError: If ``soup.find`` returns None for the address heading.
        IndexError: If the address text contains no word characters.

    Note:
        A hyphen is a non-word character, so a ZIP+4 value such as
        ``94105-1234`` yields only ``1234``.
    """
    data = soup.find(class_='Root__H2-sc-1lnoi1-1 eaHtqb Root-sc-1lnoi1-4 jBwoEe Heading-p8la3x-0 irVoEB').get_text().strip()
    # Turn punctuation into spaces so commas never stay attached to a token.
    return re.sub(r"[^\w]", " ", data).split()[-1]
def getCommunityState(soup):
    """Return the second-to-last word of the address heading, expected to be the state.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The penultimate whitespace-separated token of the address text after
        every non-word character has been replaced by a space.

    Raises:
        AttributeError: If ``soup.find`` returns None for the address heading.
        IndexError: If the address text has fewer than two words.
    """
    data = soup.find(
        class_='Root__H2-sc-1lnoi1-1 eaHtqb Root-sc-1lnoi1-4 jBwoEe Heading-p8la3x-0 irVoEB').get_text().strip()
    # Turn punctuation into spaces so commas never stay attached to a token.
    return re.sub(r"[^\w]", " ", data).split()[-2]
def getCommunityCity(soup):
    """Return the city portion of the address heading.

    The address is expected to end with ``<city>, <state> <zip>``
    (NOTE(review): format inferred from the sibling state/ZIP getters -
    confirm against live pages). The third word from the end is therefore the
    last word of the city. When the text before the state is comma-delimited,
    the whole final comma-separated segment is returned so multi-word cities
    such as "San Mateo" are not cut down to "Mateo". Otherwise only the
    single third-from-last word is returned.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The city name as a string.

    Raises:
        AttributeError: If ``soup.find`` returns None for the address heading.
        IndexError: If the address text has fewer than three words.
    """
    data = soup.find(
        class_='Root__H2-sc-1lnoi1-1 eaHtqb Root-sc-1lnoi1-4 jBwoEe Heading-p8la3x-0 irVoEB').get_text().strip()
    # Single-word answer, also used as the fallback below.
    last_city_word = re.sub(r"[^\w]", " ", data).split()[-3]

    # Drop the two trailing tokens (state and ZIP) with their separators.
    head = re.sub(r"\W+\w+\W+\w+\W*$", "", data)
    if ',' in head:
        candidate = head.rsplit(',', 1)[1].strip()
        candidate_words = re.sub(r"[^\w]", " ", candidate).split()
        # Accept the segment only if it ends with the word the token-based
        # lookup found; otherwise the address has an unexpected shape.
        if candidate_words and candidate_words[-1] == last_city_word:
            return candidate
    return last_city_word
def getImages(soup):
    """Collect the ``data-src`` URLs of images inside responsive wrappers.

    Args:
        soup: Parsed page exposing ``find_all(class_=...)``.

    Returns:
        The URLs combined with ``listToString``, or ``-1`` when no URL was
        found.
    """
    urls = []
    for wrapper in soup.find_all(class_='ResponsiveImage__ResponsiveWrapper-l4g8bp-0 jscNWe'):
        image = wrapper.find('img')
        # Skip wrappers with no <img>; previously this raised AttributeError.
        if image is None:
            continue
        source = image.get('data-src')
        # Skip images without data-src; a None entry would break the join.
        if source is not None:
            urls.append(source)
    if not urls:
        return -1
    return listToString(urls)
def getCommunityContent(soup):
    """Return the stripped text of the collapsible content block.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The block's text, or ``-1`` when the block cannot be read (for
        example when ``soup.find`` returns None).
    """
    try:
        return soup.find(class_='CollapsibleBlock__BlockCap-s326rf-1 jhhVKz').get_text().strip()
    except AttributeError:
        # Only a missing element is treated as "no content"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return -1
def getCareTypesProvided(soup):
    """Return the labels listed in the care-services section.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The stripped label of every icon item in the section, combined with
        ``listToString`` (an empty string when the section has no items), or
        ``-1`` when the section or a label element is missing.
    """
    try:
        section = soup.find(class_='CommunityCareService__Wrapper-sc-1uu6i2h-0 bDIQtl')
        items = section.find_all(class_='Root-sc-1m9vk1w-0 gCatle Block-sc-1184las-0 IconItem__Wrapper-sc-155qj67-0 ivNDcb')
        labels = [
            item.find(class_='Root-sc-1m9vk1w-0 gCatle Block-sc-1184las-0 jqgkYL').get_text().strip()
            for item in items
        ]
    except AttributeError:
        # A lookup returned None; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return -1
    return listToString(labels)
def getAmenitiesProvided(soup):
    """Return the labels listed in the amenities section.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The stripped label of every icon item in the section, combined with
        ``listToString`` (an empty string when the section has no items), or
        ``-1`` when the section or a label element is missing.
    """
    try:
        section = soup.find(class_='CommunityAmenities__Wrapper-sc-1gihmpf-0 RJvAA')
        items = section.find_all(class_='Root-sc-1m9vk1w-0 gCatle Block-sc-1184las-0 IconItem__Wrapper-sc-155qj67-0 ivNDcb')
        labels = [
            item.find(class_='Root-sc-1m9vk1w-0 gCatle Block-sc-1184las-0 jqgkYL').get_text().strip()
            for item in items
        ]
    except AttributeError:
        # A lookup returned None; a bare except would also swallow
        # KeyboardInterrupt and SystemExit.
        return -1
    return listToString(labels)
def getPricingStartsFrom(soup):
    """Return the stripped text of the starting-price element.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The element's text, or ``-1`` when the element cannot be read (for
        example when ``soup.find`` returns None).
    """
    try:
        return soup.find(class_='Span-vzvmw4-0 jYbTJi').get_text().strip()
    except AttributeError:
        # Only a missing element is treated as "no price"; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        return -1
def getPricingByRoomType(soup):
    """Return the cell texts of the pricing table, minus its two header cells.

    Every ``<td>`` of every ``<tr>`` in the pricing table is collected in
    document order. The first occurrences of ``'Type'`` and
    ``'Average Monthly Cost*'`` are then removed.

    Args:
        soup: Parsed page exposing ``find(class_=...)``.

    Returns:
        The remaining cell texts combined with ``listToString``, or ``-1``
        when the table is missing or either header cell is absent.
    """
    try:
        table = soup.find(class_='CommunityPricingTable__StyledTable-r1omm4-4 lbwBwC')
        cells = []
        for row in table.find_all('tr'):
            cells.extend(cell.get_text() for cell in row.find_all('td'))
        # list.remove raises ValueError when a header is absent; that case
        # is reported as -1, as before.
        cells.remove('Type')
        cells.remove('Average Monthly Cost*')
    except (AttributeError, ValueError):
        # A bare except would also swallow KeyboardInterrupt and SystemExit.
        return -1
    return listToString(cells)