Merge pull request #2 from twiny/dev
v2
Showing 28 changed files with 1,640 additions and 1,088 deletions.
@@ -0,0 +1,7 @@
cmd/tests
config/config.yaml
log/
result/
store/
bin/
bbolt/
@@ -1,81 +1,54 @@
## Spidy
Spidy is a tool that crawls web pages from a given list of websites, matches all domains found on each page, and finds the expired domains among them.

# Usage
Compile the package:
`go build .`
then run:
`./Spidy -config /path/to/setting.yaml`

# Output/Results
Results are saved in the ./log folder:

- errors.txt: errors encountered while scraping; helpful for debugging.
- domains.txt: list of all unique domains checked.
- found.txt: list of the available domains found.
- visited.txt: list of all unique visited URLs.

## Engine Setting:
Main app settings:

**- worker: number of threads**

example: worker: 10 => scrape 10 URLs at once.

**- depth: page scraping depth**

example: depth: 5 => visit the links on the 1st page, follow the links found on the 2nd page, and so on until the 5th page.

**- parallel: number of processors**

example: parallel: 5 => on each scraped page, process 5 links at once.

**- urls: path to a .txt file.**

Path to the input .txt file containing the URLs to crawl, one URL per line.

**- proxies: an array of proxies. Accepts only HTTP proxies.**

If no proxy is added, proxy scraping is disabled.
If one proxy is added, all scraping goes through that single proxy.
If two or more proxies are added, scraping rotates between them.
example:

proxies: ["http://username:password@1.1.1.1:2345", "http://username:password@1.1.1.1:2345", "http://username:password@1.1.1.1:2345"]

To disable proxies, use an empty array:
proxies: []

**- tlds: an array of TLDs.**

example: [com, net, org]

An empty array matches all 122 TLDs in crawler/tld.go.

**- random_delay: time duration**

A random time duration between requests.
example: 10s

**- timeout: time duration**

Sets the timeout for HTTP requests.
example: 60s
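
Put together, a setting.yaml using the example values above might look like the following sketch (key names follow the option names listed above; the exact schema the binary expects may differ):

```yaml
# illustrative setting.yaml -- values taken from the examples above
worker: 10              # number of threads (URLs scraped at once)
depth: 5                # page scraping depth
parallel: 5             # links processed at once per page
urls: "input.txt"       # text file with one URL per line
proxies: []             # empty array disables proxies
tlds: [com, net, org]   # empty array matches all supported TLDs
random_delay: 10s       # random delay between requests
timeout: 60s            # HTTP request timeout
```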

# Big Thanks
Colly V2 => https://github.com/gocolly/colly

[![Donate with Ethereum](https://en.cryptobadges.io/badge/small/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)](https://en.cryptobadges.io/donate/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)
A tool that crawls websites to find domain names and checks their availability.

### Install

```sh
git clone https://github.com/twiny/spidy.git
cd ./spidy

# build
go build -o bin/spidy -v cmd/spidy/main.go

# run
./bin/spidy -c config/config.yaml -u https://github.com
```

## Configuration

```yaml
# main crawler config
crawler:
  max_depth: 10 # max depth of pages to visit per website.
  # filter: [] # regexp filter
  rate_limit: "1/5s" # 1 request per 5 sec
  max_body_size: "20MB" # max page body size
  user_agents: # array of user-agents
    - "Spidy/2.1; +https://github.com/twiny/spidy"
  # proxies: [] # array of proxies. http(s), SOCKS5
# Logs
log:
  rotate: 7 # log rotation
  path: "./log" # log directory
# Store
store:
  ttl: "24h" # keep cache for 24h
  path: "./store" # store directory
# Results
result:
  path: ./result # result directory
parralle: 3 # number of concurrent workers
timeout: "5m" # request timeout
tlds: ["biz", "cc", "com", "edu", "info", "net", "org", "tv"] # array of domain extensions to check.
```
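
The `rate_limit` value maps onto the rate and interval passed to `wbot.SetRateLimit` in the API code further down. The actual parsing lives in `spider.ParseSetting`, which is not part of this diff; the sketch below only illustrates, under that assumption, how a string such as `"1/5s"` could be split into a count and a `time.Duration`:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

// parseRateLimit splits a "requests/interval" string such as "1/5s" into a
// numeric rate and a time.Duration. Illustrative only -- not the parser
// used by spider.ParseSetting.
func parseRateLimit(s string) (int, time.Duration, error) {
	parts := strings.SplitN(s, "/", 2)
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid rate limit %q", s)
	}
	rate, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid rate in %q: %w", s, err)
	}
	interval, err := time.ParseDuration(parts[1])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid interval in %q: %w", s, err)
	}
	return rate, interval, nil
}

func main() {
	rate, interval, err := parseRateLimit("1/5s")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d request(s) per %s\n", rate, interval) // 1 request(s) per 5s
}
```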
## TODO
- [ ] Add support for more `writers` (see the interface sketch below).
- [ ] Add terminal logging.
- [ ] Add test cases.
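
The writer abstraction itself is not shown in this diff. Judging only from how `s.write` is used in the api package below, a new writer would need to satisfy an interface along these lines (names and fields are inferred; the real definitions in `internal/pkg/spider` may differ):

```go
package spider

// Domain mirrors the fields written out by the api package; the real
// struct may carry more fields.
type Domain struct {
	URL    string
	Name   string
	TLD    string
	Status string
}

// Writer is the assumed shape of the interface satisfied by
// writer.NewCSVWriter; an additional writer (JSON, database, ...)
// would implement the same method.
type Writer interface {
	Write(d *Domain) error
}
```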

## Issues

NOTE: This package is provided "as is" with no guarantee. Use it at your own risk, and always test it yourself before using it in a production environment. If you find any issues, please create a new issue.
@@ -0,0 +1,213 @@
package api

import (
	"context"
	_ "embed"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"sync"
	"syscall"

	//
	"github.com/twiny/spidy/v2/internal/pkg/spider/v1"
	"github.com/twiny/spidy/v2/internal/service/cache"
	"github.com/twiny/spidy/v2/internal/service/writer"

	//
	"github.com/twiny/domaincheck"
	"github.com/twiny/flog"
	"github.com/twiny/wbot"
)

//go:embed version
var Version string

// Spider
type Spider struct {
	wg      *sync.WaitGroup
	setting *spider.Setting
	bot     *wbot.WBot
	pages   chan *spider.Page
	check   *domaincheck.Checker
	store   spider.Storage
	write   spider.Writer
	log     *flog.Logger
}

// NewSpider
func NewSpider(fp string) (*Spider, error) {
	// get settings
	setting := spider.ParseSetting(fp)

	// crawler opts
	opts := []wbot.Option{
		wbot.SetParallel(setting.Parralle),
		wbot.SetMaxDepth(setting.Crawler.MaxDepth),
		wbot.SetRateLimit(setting.Crawler.Limit.Rate, setting.Crawler.Limit.Interval),
		wbot.SetMaxBodySize(setting.Crawler.MaxBodySize),
		wbot.SetUserAgents(setting.Crawler.UserAgents),
		wbot.SetProxies(setting.Crawler.Proxies),
	}

	bot := wbot.NewWBot(opts...)

	check, err := domaincheck.NewChecker()
	if err != nil {
		return nil, err
	}

	// store
	store, err := cache.NewCache(setting.Store.TTL, setting.Store.Path)
	if err != nil {
		return nil, err
	}

	// logger
	log, err := flog.NewLogger(setting.Log.Path, "spidy", setting.Log.Rotate)
	if err != nil {
		return nil, err
	}

	write, err := writer.NewCSVWriter(setting.Result.Path)
	if err != nil {
		return nil, err
	}

	return &Spider{
		wg:      &sync.WaitGroup{},
		setting: setting,
		bot:     bot,
		pages:   make(chan *spider.Page, setting.Parralle),
		check:   check,
		store:   store,
		write:   write,
		log:     log,
	}, nil
}
// Start
func (s *Spider) Start(links []string) error {
	// go crawl
	s.wg.Add(len(links))
	for _, link := range links {
		go func(l string) {
			defer s.wg.Done()
			//
			if err := s.bot.Crawl(l); err != nil {
				s.log.Error(err.Error(), map[string]string{"url": l})
			}
		}(link)
	}

	// check domains
	s.wg.Add(s.setting.Parralle)
	for i := 0; i < s.setting.Parralle; i++ {
		go func() {
			defer s.wg.Done()
			// results
			for res := range s.bot.Stream() {
				// skip non-OK responses
				if res.Status != http.StatusOK {
					s.log.Info("bad HTTP status", map[string]string{
						"url":    res.URL.String(),
						"status": strconv.Itoa(res.Status),
					})
					continue
				}

				// extract domains
				domains := spider.FindDomains(res.Body)

				// check availability
				for _, domain := range domains {
					root := fmt.Sprintf("%s.%s", domain.Name, domain.TLD)

					// check if allowed extension
					if len(s.setting.TLDs) > 0 {
						if ok := s.setting.TLDs[domain.TLD]; !ok {
							s.log.Info("unsupported domain", map[string]string{
								"domain": root,
								"url":    res.URL.String(),
							})
							continue
						}
					}

					// skip if already checked
					if s.store.HasChecked(root) {
						s.log.Info("already checked", map[string]string{
							"domain": root,
							"url":    res.URL.String(),
						})
						continue
					}

					// per-domain timeout; cancel explicitly rather than defer,
					// since deferred cancels would pile up inside this loop
					ctx, cancel := context.WithTimeout(context.Background(), s.setting.Timeout)

					status, err := s.check.Check(ctx, root)
					cancel()
					if err != nil {
						s.log.Error(err.Error(), map[string]string{
							"domain": root,
							"url":    res.URL.String(),
						})
						continue
					}
					// save domain
					if err := s.write.Write(&spider.Domain{
						URL:    res.URL.String(),
						Name:   domain.Name,
						TLD:    domain.TLD,
						Status: status.String(),
					}); err != nil {
						s.log.Error(err.Error(), map[string]string{
							"domain": root,
							"url":    res.URL.String(),
						})
						continue
					}

					// terminal print
					fmt.Printf("[Spidy] == domain: %s - status %s\n", root, status.String())
				}
			}
		}()
	}

	s.wg.Wait()
	return nil
}

// Shutdown
func (s *Spider) Shutdown() error {
	// attempt graceful shutdown
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	<-sigs
	log.Println("shutting down ...")

	// 2nd ctrl+c kills program
	go func() {
		sigs := make(chan os.Signal, 1)
		signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
		<-sigs
		log.Println("killing program ...")
		os.Exit(0)
	}()

	s.bot.Close()
	s.log.Close()
	if err := s.store.Close(); err != nil {
		return err
	}
	os.Exit(0)
	return nil
}
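
The cmd/spidy/main.go referenced in the README is not part of this excerpt. A minimal sketch of how this API could be wired together is shown below; the import path and flag handling are assumptions based on the README's `-c`/`-u` usage, not the actual implementation:

```go
package main

import (
	"flag"
	"log"

	"github.com/twiny/spidy/v2/internal/app/api" // assumed import path for the api package
)

func main() {
	config := flag.String("c", "config/config.yaml", "path to the config file")
	url := flag.String("u", "", "URL to start crawling from")
	flag.Parse()

	s, err := api.NewSpider(*config)
	if err != nil {
		log.Fatal(err)
	}

	// wait for Ctrl+C in the background while the crawl runs
	go func() {
		if err := s.Shutdown(); err != nil {
			log.Println(err)
		}
	}()

	if err := s.Start([]string{*url}); err != nil {
		log.Fatal(err)
	}
}
```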
@@ -0,0 +1 @@
2.0.0