Merge pull request #2 from twiny/dev
v2
twiny authored Jul 11, 2022
2 parents 4370831 + 4230470 commit f31f292
Showing 28 changed files with 1,640 additions and 1,088 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
cmd/tests
config/config.yaml
log/
result/
store/
bin/
bbolt/
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020 Iss Meftah
Copyright (c) 2022 Twiny

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
133 changes: 53 additions & 80 deletions README.md
@@ -1,81 +1,54 @@
## Spidy
Spidy is a tool that crawls web pages from a given list of websites, matches all domains found on each page, and reports the expired ones among them.

# Usage
Compile the package with `go build .`, then run:

`./Spidy -config /path/to/setting.yaml`

# Output/Results
Results are saved in the ./log folder:

- errors.txt: errors encountered while scraping; helpful for debugging.
- domains.txt: list of all unique domains checked.
- found.txt: list of the available domains found.
- visited.txt: list of all unique visited URLs.


## Engine Settings
Main application settings:

**- worker: number of threads**

example: worker:10 => scrape 10 URLs at once.

**- depth: page scraping depth**

example: depth:5 => visit links from the 1st page, follow links found on the 2nd page, and so on down to the 5th page.

**- parallel: number of concurrent link processors**

example: parallel:5 => process 5 links at once on each scraped page.

**- urls: path to a .txt file.**

Path to the input .txt file containing the URLs to crawl, one URL per line.

**- proxies: an array of proxies. Only HTTP proxies are accepted.**

If no proxy is added, proxying is disabled. If one proxy is added, all scraping goes through that proxy. If two or more proxies are added, scraping rotates between them.
example:

proxies: ["http://username:password@1.1.1.1:2345","http://username:password@1.1.1.1:2345","http://username:password@1.1.1.1:2345"]

To disable proxies, use an empty array:
proxies: []

**- tlds: an array of TLDs.**

example: [com, net, org]

An empty array matches all 122 TLDs listed in crawler/tld.go.

**- random_delay: time duration**

A random delay applied between requests.
example: 10s

**- timeout: time duration**

Timeout for HTTP requests.
example: 60s
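
Putting these options together, a minimal setting.yaml could look like the sketch below. The real file's layout is not shown in this section, so the flat key arrangement and the sample values are assumptions drawn from the descriptions above.

```yaml
worker: 10              # scrape 10 URLs at once
depth: 5                # follow links up to 5 pages deep
parallel: 5             # process 5 links at once per page
urls: "input.txt"       # one URL per line
proxies: []             # empty array disables proxying
tlds: [com, net, org]   # empty array matches all supported TLDs
random_delay: 10s       # random pause between requests
timeout: 60s            # HTTP request timeout
```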

# Big Thanks
Colly V2 => https://github.com/gocolly/colly

[![Donate with Ethereum](https://en.cryptobadges.io/badge/small/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)](https://en.cryptobadges.io/donate/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)
A tool that crawls websites to find domain names and checks their availability.

### Install

```sh
git clone https://github.com/twiny/spidy.git
cd ./spidy

# build
go build -o bin/spidy -v cmd/spidy/main.go

# run
./bin/spidy -c config/config.yaml -u https://github.com
```

## Configuration

```yaml
# main crawler config
crawler:
  max_depth: 10 # max depth of pages to visit per website.
  # filter: [] # regexp filter
  rate_limit: "1/5s" # 1 request per 5 sec
  max_body_size: "20MB" # max page body size
  user_agents: # array of user-agents
    - "Spidy/2.1; +https://github.com/twiny/spidy"
  # proxies: [] # array of proxies. http(s), SOCKS5
# Logs
log:
  rotate: 7 # log rotation
  path: "./log" # log directory
# Store
store:
  ttl: "24h" # keep cache for 24h
  path: "./store" # store directory
# Results
result:
  path: ./result # result directory
parralle: 3 # number of concurrent workers
timeout: "5m" # request timeout
tlds: ["biz", "cc", "com", "edu", "info", "net", "org", "tv"] # array of domain extensions to check.
```
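
The spider.Setting type that consumes this file is defined elsewhere in the repository, so the snippet below is only a rough sketch of how these keys could be loaded: it unmarshals the YAML with gopkg.in/yaml.v3 into an illustrative Config struct and splits the "1/5s" rate string by hand. The field names, yaml tags, and parseRate helper are assumptions, not the package's actual parser (spider.ParseSetting).

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
	"time"

	"gopkg.in/yaml.v3"
)

// Config mirrors the keys shown above. It is a hypothetical stand-in for the
// project's spider.Setting type, which is defined elsewhere in the repository.
type Config struct {
	Crawler struct {
		MaxDepth    int      `yaml:"max_depth"`
		RateLimit   string   `yaml:"rate_limit"`    // e.g. "1/5s" = 1 request per 5 seconds
		MaxBodySize string   `yaml:"max_body_size"` // e.g. "20MB"
		UserAgents  []string `yaml:"user_agents"`
		Proxies     []string `yaml:"proxies"`
	} `yaml:"crawler"`
	Log struct {
		Rotate int    `yaml:"rotate"`
		Path   string `yaml:"path"`
	} `yaml:"log"`
	Store struct {
		TTL  string `yaml:"ttl"`
		Path string `yaml:"path"`
	} `yaml:"store"`
	Result struct {
		Path string `yaml:"path"`
	} `yaml:"result"`
	Parralle int      `yaml:"parralle"` // spelling matches the key used in config.yaml
	Timeout  string   `yaml:"timeout"`
	TLDs     []string `yaml:"tlds"`
}

// parseRate splits a "requests/interval" string such as "1/5s" into its parts.
func parseRate(s string) (int, time.Duration, error) {
	parts := strings.SplitN(s, "/", 2)
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid rate limit %q", s)
	}
	n, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, 0, err
	}
	d, err := time.ParseDuration(parts[1])
	if err != nil {
		return 0, 0, err
	}
	return n, d, nil
}

func main() {
	raw, err := os.ReadFile("config/config.yaml")
	if err != nil {
		panic(err)
	}
	var cfg Config
	if err := yaml.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	rate, interval, err := parseRate(cfg.Crawler.RateLimit)
	if err != nil {
		panic(err)
	}
	timeout, err := time.ParseDuration(cfg.Timeout) // "5m"
	if err != nil {
		panic(err)
	}
	fmt.Println(cfg.Crawler.MaxDepth, cfg.Parralle, rate, interval, timeout, cfg.TLDs)
}
```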
## TODO
- [ ] Add support for more `writers`.
- [ ] Add terminal logging.
- [ ] Add test cases.

## Issues

NOTE: This package is provided "as is" with no guarantee. Use it at your own risk and always test it yourself before using it in a production environment. If you find any issues, please create a new issue.
213 changes: 213 additions & 0 deletions cmd/spidy/api/spider.go
@@ -0,0 +1,213 @@
package api

import (
"context"
_ "embed"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"strconv"
"sync"
"syscall"

//

"github.com/twiny/spidy/v2/internal/pkg/spider/v1"
"github.com/twiny/spidy/v2/internal/service/cache"
"github.com/twiny/spidy/v2/internal/service/writer"

//
"github.com/twiny/domaincheck"
"github.com/twiny/flog"
"github.com/twiny/wbot"
)

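// Version is the spidy release string, embedded at build time from the adjacent version file.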
//go:embed version
var Version string

// Spider bundles the crawler, domain checker, cache, result writer, and logger.
type Spider struct {
wg *sync.WaitGroup
setting *spider.Setting
bot *wbot.WBot
pages chan *spider.Page
check *domaincheck.Checker
store spider.Storage
write spider.Writer
log *flog.Logger
}

// NewSpider parses the settings file at fp and wires up the crawler, domain checker, cache, logger, and CSV writer.
func NewSpider(fp string) (*Spider, error) {
// get settings
setting := spider.ParseSetting(fp)

// crawler opts
opts := []wbot.Option{
wbot.SetParallel(setting.Parralle),
wbot.SetMaxDepth(setting.Crawler.MaxDepth),
wbot.SetRateLimit(setting.Crawler.Limit.Rate, setting.Crawler.Limit.Interval),
wbot.SetMaxBodySize(setting.Crawler.MaxBodySize),
wbot.SetUserAgents(setting.Crawler.UserAgents),
wbot.SetProxies(setting.Crawler.Proxies),
}

bot := wbot.NewWBot(opts...)

check, err := domaincheck.NewChecker()
if err != nil {
return nil, err
}

// store
store, err := cache.NewCache(setting.Store.TTL, setting.Store.Path)
if err != nil {
return nil, err
}

// logger
log, err := flog.NewLogger(setting.Log.Path, "spidy", setting.Log.Rotate)
if err != nil {
return nil, err
}

write, err := writer.NewCSVWriter(setting.Result.Path)
if err != nil {
return nil, err
}

return &Spider{
wg: &sync.WaitGroup{},
setting: setting,
bot: bot,
pages: make(chan *spider.Page, setting.Parralle),
check: check,
store: store,
write: write,
log: log,
}, nil
}

// Start launches one crawl goroutine per seed link plus Parralle worker goroutines that check every domain found on the crawled pages, and blocks until they all finish.
func (s *Spider) Start(links []string) error {
// go crawl
s.wg.Add(len(links))
for _, link := range links {
go func(l string) {
defer s.wg.Done()
//
if err := s.bot.Crawl(l); err != nil {
s.log.Error(err.Error(), map[string]string{"url": l})
}
}(link)
}

// check domains
s.wg.Add(s.setting.Parralle)
for i := 0; i < s.setting.Parralle; i++ {
go func() {
defer s.wg.Done()
// each worker drains the shared crawl-result stream until it is closed
for res := range s.bot.Stream() {
// skip responses that did not return HTTP 200 OK
if res.Status != http.StatusOK {
s.log.Info("bad HTTP status", map[string]string{
"url": res.URL.String(),
"status": strconv.Itoa(res.Status),
})
continue
}

// extract domains
domains := spider.FindDomains(res.Body)

// check availability
for _, domain := range domains {
root := fmt.Sprintf("%s.%s", domain.Name, domain.TLD)

// check if allowed extension
if len(s.setting.TLDs) > 0 {
if ok := s.setting.TLDs[domain.TLD]; !ok {
s.log.Info("unsupported domain", map[string]string{
"domain": root,
"url": res.URL.String(),
})
continue
}
}

// skip if already checked
if s.store.HasChecked(root) {
s.log.Info("already checked", map[string]string{
"domain": root,
"url": res.URL.String(),
})
continue
}

//
ctx, cancel := context.WithTimeout(context.Background(), s.setting.Timeout)

status, err := s.check.Check(ctx, root)
cancel() // release the context right away; defer inside this loop would hold every context until Start returns
if err != nil {
s.log.Error(err.Error(), map[string]string{
"domain": root,
"url": res.URL.String(),
})
continue
}

// save domain
if err := s.write.Write(&spider.Domain{
URL: res.URL.String(),
Name: domain.Name,
TLD: domain.TLD,
Status: status.String(),
}); err != nil {
s.log.Error(err.Error(), map[string]string{
"domain": root,
"url": res.URL.String(),
})
continue
}

// terminal print
fmt.Printf("[Spidy] == domain: %s - status %s\n", root, status.String())
}
}
}()
}

s.wg.Wait()
return nil
}

// Shutdown blocks until an interrupt signal arrives, then closes the bot, logger, and store; a second signal kills the program immediately.
func (s *Spider) Shutdown() error {
// attempt graceful shutdown
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

<-sigs
log.Println("shutting down ...")

// 2nd ctrl+c kills program
go func() {
sigs := make(chan os.Signal, 1)
signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
<-sigs
log.Println("killing program ...")
os.Exit(0)
}()

s.bot.Close()
s.log.Close()
if err := s.store.Close(); err != nil {
return err
}
os.Exit(0)
return nil
}
1 change: 1 addition & 0 deletions cmd/spidy/api/version
@@ -0,0 +1 @@
2.0.0