Skip to content

Commit

Permalink
refactor: 添加更多抓取异常情况处理
Browse files Browse the repository at this point in the history
  • Loading branch information
modood committed Feb 28, 2020
1 parent 5369f94 commit 66f42ae
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
13 changes: 12 additions & 1 deletion lib/crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ exports.fetch = (host, route, regexp, codeLen) =>
const bufferHelper = new BufferHelper()
const statusCode = res.statusCode

// 302 Moved Temporarily
// 这种情况一般重试就可以了,所以视为超时统一重试处理
if (statusCode === 302) {
res.resume()
return reject(new Error('timeout'))
}

This comment has been minimized.

Copy link
@modood

modood Mar 3, 2020

Author Owner

相关 issue :#55

if (statusCode !== 200) {
res.resume()
return reject(new Error('Request Failed. Status Code: ' + statusCode))
Expand All @@ -49,7 +56,11 @@ exports.fetch = (host, route, regexp, codeLen) =>
let current
while ((current = regexp.exec(rawData)) !== null) result[current[1].substr(0, codeLen)] = current[2].trim()
if (Object.keys(result).length === 0) {
return reject(new Error('Request Failed. rawData: '), rawData)
const raw = iconv.decode(bufferHelper.toBuffer(), 'UTF-8')
if (raw.includes('请开启JavaScript并刷新该页')) {
console.log('\n温馨提示:请求过于频繁已被目标网站限制,当前抓取进度已保存,请五分钟后再试...\n')
process.exit(0)
}
}

return resolve(result)
Expand Down
16 changes: 14 additions & 2 deletions lib/worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@ const limit = 100
* @datetime 2018-01-31 22:11
*/
exports.fetchProvinces = async () => {
const count = await Province.count()
if (count !== 0) {
return
}

console.log('[1/1]正在抓取省级数据...')
const o = await crawler.fetchProvinces()
const rows = []
Expand All @@ -29,12 +34,19 @@ exports.fetchProvinces = async () => {
exports.fetchCities = async () => {
await exports.fetchProvinces()

const count = await Province.count()
const fetchedProvinceCode = await City.aggregate('provinceCode', 'DISTINCT', { plain: false }).map(o => o.DISTINCT)
const where = { code: { [Sequelize.Op.notIn]: fetchedProvinceCode } }
const count = await Province.count({ where })

if (count === 0) {
return
}

let index = 0
let hasNext = true
let after
while (hasNext) {
const r = await Province.paginate({ limit, after })
const r = await Province.paginate({ where, limit, after })
const rows = []
for (let i = 0; i < r.results.length; i++) {
const { dataValues: {
Expand Down

0 comments on commit 66f42ae

Please sign in to comment.