diff --git a/common/src/main/java/com/quick/hui/crawler/common/FileReadUtil.java b/common/src/main/java/com/quick/hui/crawler/common/FileReadUtil.java new file mode 100644 index 0000000..dc4575c --- /dev/null +++ b/common/src/main/java/com/quick/hui/crawler/common/FileReadUtil.java @@ -0,0 +1,99 @@ +package com.quick.hui.crawler.common; + + +import java.io.*; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +/** + * Created by yihui on 2017/5/6. + */ +public class FileReadUtil { + + /** + * Opens the named file as a byte stream; commonly used for binary files such as images, audio and video. + * + * @param fileName name of the file + */ + public static InputStream createByteRead(String fileName) throws IOException { + +// File file = new File(fileName); +// +// return new FileInputStream(file); + return getStreamByFileName(fileName); + } + + + /** + * Opens the named file as a UTF-8 character reader; commonly used for text, numeric and similar files. + * + * @param fileName name of the file + */ + public static Reader createCharRead(String fileName) throws IOException { +// File file = new File(fileName); +// return new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8")); + + return new InputStreamReader(getStreamByFileName(fileName), Charset.forName("UTF-8")); + } + + + /** + * Opens the named file as a UTF-8 buffered reader for line-by-line reading; commonly used for line-oriented formatted files. + * + * @param fileName name of the file + */ + public static BufferedReader createLineRead(String fileName) throws IOException { +// File file = new File(fileName); +//// return new BufferedReader(new FileReader(file)); +// return new BufferedReader(new InputStreamReader(new FileInputStream(file), Charset.forName("UTF-8"))); + + return new BufferedReader(new InputStreamReader(getStreamByFileName(fileName), Charset.forName("UTF-8"))); + } + + + public static InputStream getStreamByFileName(String fileName) throws IOException { + check(fileName); + + if (fileName.startsWith("http")) { // network address (any name starting with "http" is opened as a URL) + URL url = new URL(fileName); + return url.openStream(); + } else if (fileName.startsWith("/")) { // absolute path on the local file system + Path path = Paths.get(fileName); 
+ return Files.newInputStream(path); + } else { // relative path: looked up on the classpath (NOTE(review): getResourceAsStream may return null if the resource is missing) + return FileReadUtil.class.getClassLoader().getResourceAsStream(fileName); + } + } + + + public static File getFile(String fileName) throws IOException { + check(fileName); + + if (fileName.startsWith("http")) { // network address: only the file part of the URL is kept + URL url = new URL(fileName); + fileName = url.getFile(); + } else if (!fileName.startsWith("/")){ // relative path: resolved as a classpath resource + URL url = FileReadUtil.class.getClassLoader().getResource(fileName); + check(url, "System do not have this file : " + fileName); + fileName = url.getFile(); + } + + return new File(fileName); + } + + + + private static void check(Object arg) { + check(arg, "params should not be null!"); + } + + private static void check(Object arg, String msg) { + if (arg == null) { + throw new IllegalArgumentException(msg); + } + } + +} diff --git a/core/pom.xml b/core/pom.xml index 6023b05..b711371 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -63,6 +63,21 @@ logback-classic 1.1.7 + + com.quick.hui.crawler + common + 1.0-SNAPSHOT + + + com.google.guava + guava + 21.0 + + + commons-io + commons-io + 2.5 + diff --git a/core/src/main/java/com/quick/hui/crawler/core/conf/Config.java b/core/src/main/java/com/quick/hui/crawler/core/conf/Config.java new file mode 100644 index 0000000..ce31711 --- /dev/null +++ b/core/src/main/java/com/quick/hui/crawler/core/conf/Config.java @@ -0,0 +1,35 @@ +package com.quick.hui.crawler.core.conf; + +import com.quick.hui.crawler.core.utils.NumUtils; +import lombok.Getter; +import lombok.Setter; +import lombok.ToString; + +/** + * Created by yihui on 2017/7/8. 
+ */ +@Getter +@Setter +@ToString +public class Config { + + /** + * Interval between crawl tasks (Fetcher passes this value to Thread.sleep, i.e. milliseconds) + */ + private long sleep; + + + /** + * When polling the queue returns nothing, how long to wait before retrying (Fetcher passes this value to Thread.sleep) + */ + private long emptyQueueWaitTime; + + + public void setSleep(String str, long sleep) { + this.sleep = NumUtils.str2long(str, sleep); + } + + public void setEmptyQueueWaitTime(String str, long emptyQueueWaitTime) { + this.emptyQueueWaitTime = NumUtils.str2long(str, emptyQueueWaitTime); + } +} diff --git a/core/src/main/java/com/quick/hui/crawler/core/conf/ConfigWrapper.java b/core/src/main/java/com/quick/hui/crawler/core/conf/ConfigWrapper.java new file mode 100644 index 0000000..2ce7c1e --- /dev/null +++ b/core/src/main/java/com/quick/hui/crawler/core/conf/ConfigWrapper.java @@ -0,0 +1,81 @@ +package com.quick.hui.crawler.core.conf; + +import com.google.common.eventbus.EventBus; +import com.google.common.eventbus.Subscribe; +import com.quick.hui.crawler.core.conf.file.FileConfRead; +import lombok.Getter; +import lombok.Setter; +import lombok.extern.slf4j.Slf4j; + + +/** + * Created by yihui on 2017/7/8. + */ +@Slf4j +public class ConfigWrapper { + private static final String CONFIG_PATH = "conf/crawler.properties"; + + private EventBus eventBus; + + + private IConfRead confRead; + + private Config config; + + private static volatile ConfigWrapper instance; + + private ConfigWrapper() { + confRead = new FileConfRead(); + confRead.registerCheckTask(CONFIG_PATH); + config = confRead.initConf(CONFIG_PATH); + + + // register this instance as an event listener + eventBus = new EventBus(); + eventBus.register(this); + } + + + public static ConfigWrapper getInstance() { + if (instance == null) { + synchronized (ConfigWrapper.class) { + if (instance == null) { + instance = new ConfigWrapper(); + } + } + } + + return instance; + } + + + @Subscribe + public void init(UpdateConfEvent event) { + config = confRead.initConf(event.conf); + + if (log.isDebugEnabled()) { + log.debug("time:{} processor:{} update config! 
new config is: {}", + event.now, event.operator, config); + } + } + + + public Config getConfig() { + return config; + } + + + public void post(Object event) { + eventBus.post(event); + } + + @Getter + @Setter + public static class UpdateConfEvent { + private long now = System.currentTimeMillis(); + + private String operator = "System"; + + private String conf = CONFIG_PATH; + } +} diff --git a/core/src/main/java/com/quick/hui/crawler/core/conf/IConfRead.java b/core/src/main/java/com/quick/hui/crawler/core/conf/IConfRead.java new file mode 100644 index 0000000..f1d3760 --- /dev/null +++ b/core/src/main/java/com/quick/hui/crawler/core/conf/IConfRead.java @@ -0,0 +1,25 @@ +package com.quick.hui.crawler.core.conf; + +/** + * Interface for reading the configuration file + *

+ * Created by yihui on 2017/7/8. + */ +public interface IConfRead { + + /** + * Initializes the configuration + * + * @param var + * @return + */ + Config initConf(String var); + + + /** + * Registers a task that checks for configuration updates + * + * @param path + */ + void registerCheckTask(final String path); +} diff --git a/core/src/main/java/com/quick/hui/crawler/core/conf/file/FileConfRead.java b/core/src/main/java/com/quick/hui/crawler/core/conf/file/FileConfRead.java new file mode 100644 index 0000000..b2962db --- /dev/null +++ b/core/src/main/java/com/quick/hui/crawler/core/conf/file/FileConfRead.java @@ -0,0 +1,75 @@ +package com.quick.hui.crawler.core.conf.file; + +import com.quick.hui.crawler.common.FileReadUtil; +import com.quick.hui.crawler.core.conf.Config; +import com.quick.hui.crawler.core.conf.ConfigWrapper; +import com.quick.hui.crawler.core.conf.IConfRead; +import lombok.extern.slf4j.Slf4j; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Properties; +import java.util.concurrent.Executors; +import java.util.concurrent.ScheduledExecutorService; +import java.util.concurrent.TimeUnit; + +/** + * Loads configuration from a properties file + *

+ * Created by yihui on 2017/7/8. + */ +@Slf4j +public class FileConfRead implements IConfRead { + + + public Config initConf(String path) { + try { + Properties properties = read(path); + + Config config = new Config(); + config.setSleep(properties.getProperty("sleep"), 0); + config.setEmptyQueueWaitTime(properties.getProperty("emptyQueueWaitTime"), 200); + + return config; + } catch (Exception e) { + log.error("init config from file: {} error! e: {}", path, e); + return new Config(); + } + } + + + private Properties read(String fileName) throws IOException { + try (InputStream inputStream = FileReadUtil.getStreamByFileName(fileName)) { + Properties pro = new Properties(); + pro.load(inputStream); + return pro; + } + } + + + private File file; + private long lastTime; + + public void registerCheckTask(final String path) { + try { + file = FileReadUtil.getFile(path); + lastTime = file.lastModified(); + + + ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1); + scheduledExecutorService.scheduleAtFixedRate(() -> { + if (file.lastModified() > lastTime) { + lastTime = file.lastModified(); + ConfigWrapper.getInstance().post(new ConfigWrapper.UpdateConfEvent()); + } + }, + 1, + 1, + TimeUnit.MINUTES); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java index 8fcbaac..21b91d4 100644 --- a/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java +++ b/core/src/main/java/com/quick/hui/crawler/core/fetcher/Fetcher.java @@ -1,5 +1,6 @@ package com.quick.hui.crawler.core.fetcher; +import com.quick.hui.crawler.core.conf.ConfigWrapper; import com.quick.hui.crawler.core.entity.CrawlMeta; import com.quick.hui.crawler.core.job.DefaultAbstractCrawlJob; import lombok.*; @@ -74,11 +75,23 @@ public void start(Class clz) throws Excep while (!fetchQueue.isOver) { crawlMeta = 
fetchQueue.pollSeed(); if (crawlMeta == null) { - Thread.sleep(200); + Thread.sleep(ConfigWrapper.getInstance().getConfig().getEmptyQueueWaitTime()); continue; } + try { + long sleep = ConfigWrapper.getInstance().getConfig().getSleep(); + Thread.sleep(sleep); + + if (log.isDebugEnabled()) { + log.debug("Sleep {} ms", sleep); + } + } catch (Exception e) { + log.error("fetcher sleep exception! e:{} ", e); + } + + DefaultAbstractCrawlJob job = clz.newInstance(); job.setDepth(this.maxDepth); job.setCrawlMeta(crawlMeta); diff --git a/core/src/main/java/com/quick/hui/crawler/core/utils/NumUtils.java b/core/src/main/java/com/quick/hui/crawler/core/utils/NumUtils.java new file mode 100644 index 0000000..73617b9 --- /dev/null +++ b/core/src/main/java/com/quick/hui/crawler/core/utils/NumUtils.java @@ -0,0 +1,27 @@ +package com.quick.hui.crawler.core.utils; + +import lombok.extern.slf4j.Slf4j; + +/** + * Created by yihui on 2017/7/12. + */ +@Slf4j +public class NumUtils { + + public static long str2long(String str, long defaultValue) { + if (str == null) { + return defaultValue; + } + + + try { + return Long.parseLong(str.trim()); + } catch (Exception e) { + if (log.isDebugEnabled()) { + log.debug("parse str{} to long error! 
return defaultValue: {}", str, defaultValue); + } + return defaultValue; + } + } + +} diff --git a/core/src/main/resources/conf/crawler.properties b/core/src/main/resources/conf/crawler.properties new file mode 100644 index 0000000..a9a6726 --- /dev/null +++ b/core/src/main/resources/conf/crawler.properties @@ -0,0 +1 @@ +sleep=10 \ No newline at end of file diff --git a/core/src/test/java/com/quick/hui/crawler/core/test/conf/ConfWrapperTest.java b/core/src/test/java/com/quick/hui/crawler/core/test/conf/ConfWrapperTest.java new file mode 100644 index 0000000..0560502 --- /dev/null +++ b/core/src/test/java/com/quick/hui/crawler/core/test/conf/ConfWrapperTest.java @@ -0,0 +1,21 @@ +package com.quick.hui.crawler.core.test.conf; + +import com.quick.hui.crawler.core.conf.Config; +import com.quick.hui.crawler.core.conf.ConfigWrapper; +import lombok.extern.slf4j.Slf4j; +import org.junit.Test; + +/** + * Created by yihui on 2017/7/8. + */ +@Slf4j +public class ConfWrapperTest { + + + @Test + public void testConfLoad() { + Config config = ConfigWrapper.getInstance().getConfig(); + log.info("the config : {}", config); + } + +}