Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

async, await 练习- 存储关注专栏里面的前3篇文章 #22

Open
AnnVoV opened this issue Dec 5, 2018 · 0 comments
Open

async, await 练习- 存储关注专栏里面的前3篇文章 #22

AnnVoV opened this issue Dec 5, 2018 · 0 comments

Comments

@AnnVoV
Copy link
Owner

AnnVoV commented Dec 5, 2018

const request = require('request-promise-native');
const cheerio = require('cheerio');
const config = require('../config');
const zhihuRoot = config.zhihu.root;
const pageSize = config.page.pageSize;
const ColumnModel = require('../model/column');
const ContentModel = require('../model/content');

// Zhihu account whose followed columns are crawled (used in the API URL and
// when reading the follow count out of the profile page's initial state).
const USER_NAME = 'anran-0423';

// db start
// Connection settings come from config.db.
const {host, database, port} = config.db;
const mongoose = require('mongoose');
// Make Mongoose use the runtime's native Promise implementation.
mongoose.Promise = global.Promise;
// NOTE(review): the positional connect(host, database, port) form depends on
// the installed Mongoose version — confirm it is still supported there.
// The call is neither awaited nor given an error handler at this point.
mongoose.connect(host, database, port);

/**
 * Fetch one slice of the columns followed by USER_NAME from the Zhihu API
 * and upsert every returned column through ColumnModel.
 *
 * @param {number} offset - Value sent as the `offset` query parameter.
 * @param {number} limit - Value sent as the `limit` query parameter.
 * @returns {Promise<Array>} Resolves with the result of each upsert, in the
 *   same order as the columns in the API response.
 */
const exploreColumns = async (offset, limit) => {
  const query = `offset=${offset}&limit=${limit}`;
  const response = await request({
    method: 'GET',
    uri: `https://www.zhihu.com/api/v4/members/${USER_NAME}/following-columns?${query}`,
    json: true,
  });

  // findOneAndUpdate (rather than update) is used so that the document is
  // handed back; `new: true` asks for the post-update version, see
  // https://segmentfault.com/a/1190000009706886 and
  // https://stackoverflow.com/questions/32811510/mongoose-findoneandupdate-doesnt-return-updated-document
  const upsertColumn = async (column) => {
    const filter = {id: column.id};
    const settings = {upsert: true, new: true};
    return ColumnModel.findOneAndUpdate(filter, column, settings).exec();
  };

  const pendingUpserts = response.data.map((column) => upsertColumn(column));
  return Promise.all(pendingUpserts);
};

/**
 * Fetch the article list of a column and keep its first three entries.
 *
 * Each kept article is returned as a copy tagged with `columnId` set to the
 * column's `id`.
 *
 * @param {{id: (string|number)}} column - Column whose articles are fetched.
 * @returns {Promise<Array<Object>>} Resolves with at most three articles.
 *   Rejects when the request fails; the promise is returned directly instead
 *   of being wrapped in `new Promise`, so a failed request can no longer
 *   leave the caller waiting on a promise that never settles.
 */
const getArticledData = async (column) => {
  const options = {
    uri: `https://zhuanlan.zhihu.com/api2/columns/${column.id}/articles`,
    json: true,
  };
  const res = await request(options);
  // Only the first three articles of the column are kept.
  return res.data
    .slice(0, 3)
    .map((article) => Object.assign({}, article, {columnId: column.id}));
};

/**
 * Read how many columns USER_NAME follows from the profile page.
 *
 * Despite the name, the resolved value is the total number of followed
 * columns (`followingColumnsCount`); the caller derives the page count
 * from it.
 *
 * The page embeds its initial state as JSON inside the element with id
 * `js-initialData`; the count is read from
 * `initialState.entities.users[USER_NAME]`.
 *
 * @returns {Promise<number>} Resolves with the followed-column count.
 *   Rejects when the request fails or the expected data is missing, rather
 *   than logging the error and never settling.
 */
const getPageSize = async () => {
  // Use USER_NAME here as well so the URL and the lookup below cannot drift.
  const html = await request(`${zhihuRoot}/people/${USER_NAME}/following/columns`);
  const $ = cheerio.load(html);
  const rawState = $('#js-initialData').html();
  if (!rawState) {
    throw new Error('#js-initialData was not found in the following-columns page');
  }
  const users = JSON.parse(rawState).initialState.entities.users;
  const user = users[USER_NAME];
  if (!user) {
    throw new Error(`User "${USER_NAME}" is missing from the page's initial state`);
  }
  return user.followingColumnsCount;
};

/**
 * Upsert a list of articles through ContentModel, linking each one to its
 * column.
 *
 * Every article's `columnId` is overwritten with the column's `_id` before
 * it is written (the article objects passed in are mutated).
 *
 * @param {Array<Object>} articleArr - Articles to store; matched by `id`.
 * @param {{_id: *}} column - Column the articles belong to.
 * @returns {Promise<Array>} Resolves once every write has finished, with
 *   the result of each write; rejects if any write fails.
 */
const saveArticles = (articleArr, column) => {
  const promiseArr = articleArr.map((article) => {
    article.columnId = column._id;
    // Return the query promise so Promise.all really waits for the write
    // and surfaces its failure.
    return ContentModel
      .update({id: article.id}, article, {upsert: true})
      .exec();
  });
  return Promise.all(promiseArr);
};

/**
 * Crawl every followed column page by page and store the first articles of
 * each column.
 *
 * Steps: read the total number of followed columns, split it into pages of
 * `pageSize`, fetch and store the columns of every page in parallel, then
 * fetch and store the articles of every column.
 *
 * @returns {Promise<void>} Resolves when all pages have been processed and
 *   saved. Any failure along the way is logged rather than rethrown.
 */
const init = async () => {
  try {
    const allNum = await getPageSize();
    const pageCount = Math.ceil(allNum / pageSize);
    const pageIndexes = Array.from({length: pageCount}, (val, index) => index);

    const pageTasks = pageIndexes.map(async (pageIndex) => {
      // exploreColumns takes (offset, limit): the limit is the page size,
      // not the end position of the page.
      const offset = pageIndex * pageSize;
      const columns = await exploreColumns(offset, pageSize);
      const articleTasks = columns.map(async (column) => {
        const articleArr = await getArticledData(column);
        // Returned so the success message is only printed after the save.
        return saveArticles(articleArr, column);
      });
      return Promise.all(articleTasks);
    });

    await Promise.all(pageTasks);
    console.log('抓取数据成功!');
  } catch (err) {
    console.log(err);
  }
};

// Start the crawl as soon as this file is executed. The promise returned by
// init() is not awaited and no rejection handler is attached here.
init();

主要遇到的问题:
Using async/await with a forEach loop
https://stackoverflow.com/questions/37576685/using-async-await-with-a-foreach-loop

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant