scraper.js
// Configurable CSV scraper: renders each page with puppeteer and extracts fields with cheerio.
const $ = require('cheerio');
const fs = require('fs');
const puppeteer = require('puppeteer');

let recordCount = 0;
// Load the scraper configuration, then start a scrape for each configured group.
fs.readFile('./scraper-config.json', function (err, data) {
  if (err) {
    throw err;
  }
  let config = JSON.parse(data);
  synchronousScrape(config);
});
// Despite its name, this starts the groups concurrently: beginScraper is async
// and is not awaited, so each group's scrape simply runs in parallel.
function synchronousScrape(config) {
  for (let groupName in config.groups) {
    beginScraper(config.groups[groupName], groupName, config.baseUrl);
  }
}
async function beginScraper(config, groupName, baseUrl) {
  // Each group writes to its own CSV file; keep the stream local so that
  // concurrently running groups do not overwrite each other's output.
  fs.writeFileSync(config.outputFile, '');
  const outputStream = fs.createWriteStream(config.outputFile);

  // Write the CSV header row from the configured field names.
  let { scrape } = config;
  for (let field in scrape) {
    outputStream.write(scrape[field].name + ',');
  }
  outputStream.write('\n');

  // Render each page in headless Chromium, then parse the resulting HTML with cheerio.
  config.urls.forEach((pageUrl) => {
    let browser;
    puppeteer
      .launch()
      .then(function (b) {
        browser = b;
        return browser.newPage();
      })
      .then(function (page) {
        return page.goto(baseUrl + pageUrl).then(function () {
          return page.content();
        });
      })
      .then(html => {
        // Every element matching config.selector becomes one CSV record.
        let content = $(config.selector, html);
        for (let i = 0; i < content.length; i++) {
          let record = {};
          for (let field in scrape) {
            let fieldConfig = scrape[field];
            record[field] = $(content[i]).find(fieldConfig.selector);
            if (fieldConfig.hasOwnProperty('options')) {
              // 'attribute' pulls an attribute value instead of the element text.
              if (fieldConfig.options.hasOwnProperty('attribute')) {
                if (record[field][0] == undefined || record[field][0].attribs[fieldConfig.options.attribute] == undefined) {
                  record[field] = '-';
                } else {
                  record[field] = record[field][0].attribs[fieldConfig.options.attribute];
                }
              }
              // 'prependBaseUrl' turns relative links into absolute ones.
              if (fieldConfig.options.hasOwnProperty('prependBaseUrl') && fieldConfig.options.prependBaseUrl) {
                record[field] = baseUrl + record[field];
              }
              // 'excludeSiblings' keeps only the element's own text, dropping text from child elements.
              if (fieldConfig.options.hasOwnProperty('excludeSiblings') && fieldConfig.options.excludeSiblings) {
                record[field] = record[field].clone().children().remove().end().text().trim();
              }
            } else {
              // Default: the quoted text content of the matched element.
              record[field] = '"' + record[field].text().trim() + '"';
            }
          }
          console.log(++recordCount);
          for (let field in record) {
            outputStream.write(record[field] + ',');
          }
          outputStream.write('\n');
        }
      })
      .then(() => browser.close()); // close the headless browser once the page has been scraped
  });
}
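/*
 * For reference, a minimal sketch of what scraper-config.json could look like,
 * inferred from how this script reads it. The structure (baseUrl, groups, and the
 * per-group outputFile, selector, urls, and scrape fields with the optional
 * attribute, prependBaseUrl, and excludeSiblings options) comes from the code
 * above; the concrete selectors, URLs, and file names below are illustrative
 * assumptions only, not values from the original repository.
 *
 * {
 *   "baseUrl": "https://example.com",
 *   "groups": {
 *     "books": {
 *       "outputFile": "books.csv",
 *       "selector": ".result-row",
 *       "urls": ["/catalog?page=1", "/catalog?page=2"],
 *       "scrape": {
 *         "title": { "name": "Title", "selector": ".title", "options": { "excludeSiblings": true } },
 *         "link":  { "name": "Link",  "selector": "a",      "options": { "attribute": "href", "prependBaseUrl": true } },
 *         "price": { "name": "Price", "selector": ".price" }
 *       }
 *     }
 *   }
 * }
 *
 * With a config like this in the working directory, the scraper can be started with
 * `node scraper.js`; each group then writes its records to its configured CSV file.
 */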