Merge pull request #2 from twiny/dev
v2
Showing 28 changed files with 1,640 additions and 1,088 deletions.
@@ -0,0 +1,7 @@
cmd/tests
config/config.yaml
log/
result/
store/
bin/
bbolt/
@@ -1,81 +1,54 @@
## Spidy
Spidy is a tool that crawls web pages from a given list of websites, matches all domains found on each page, and finds the expired domains among them.

# Usage
Compile the package:
`go build .`
then run:
`./Spidy -config /path/to/setting.yaml`

# Output/Results
Results are saved in the ./log folder:

- errors.txt: errors encountered while scraping; helpful for debugging.
- domains.txt: list of all unique domains checked.
- found.txt: list of the available domains found.
- visited.txt: list of all unique visited URLs.

## Engine Setting:
Main app settings:

**- worker: number of threads**

example: worker: 10 => scrape 10 URLs at once.

**- depth: page scraping depth**

example: depth: 5 => visit the links on the 1st page, follow the links found on the 2nd page, and so on until the 5th page.

**- parallel: number of processors**

example: parallel: 5 => on each scraped page, process 5 links at once.

**- urls: path to a .txt file.**

Path to the input .txt file containing the URLs to crawl, one URL per line.

**- proxies: an array of proxies. Accepts only HTTP proxies.**

If no proxy is added, proxy scraping is disabled.
If one proxy is added, all scraping goes through that single proxy.
If two or more proxies are added, scraping rotates between them.
example:

proxies: ["http://username:password@1.1.1.1:2345", "http://username:password@1.1.1.1:2345", "http://username:password@1.1.1.1:2345"]

To disable proxies, use an empty array:
proxies: []

**- tlds: an array of TLDs.**

example: [com, net, org]

An empty array matches all 122 TLDs in crawler/tld.go.

**- random_delay: time duration**

A random time duration between requests.
example: 10s

**- timeout: time duration**

Sets the timeout for HTTP requests.
example: 60s
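
Put together, a setting.yaml using the example values above might look like the following sketch (key names follow the option names listed above; the exact schema the binary expects may differ):

```yaml
# illustrative setting.yaml -- values taken from the examples above
worker: 10              # number of threads (URLs scraped at once)
depth: 5                # page scraping depth
parallel: 5             # links processed at once per page
urls: "input.txt"       # text file with one URL per line
proxies: []             # empty array disables proxies
tlds: [com, net, org]   # empty array matches all supported TLDs
random_delay: 10s       # random delay between requests
timeout: 60s            # HTTP request timeout
```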

# Big Thanks
Colly V2 => https://github.com/gocolly/colly

[![Donate with Ethereum](https://en.cryptobadges.io/badge/small/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)](https://en.cryptobadges.io/donate/0x94a003520Ad7F9aFF613c1cb6798a96256217EC9)
A tool that crawls websites to find domain names and checks their availability.

### Install

```sh
git clone https://github.com/twiny/spidy.git
cd ./spidy

# build
go build -o bin/spidy -v cmd/spidy/main.go

# run
./bin/spidy -c config/config.yaml -u https://github.com
```

## Configuration

```yaml
# main crawler config
crawler:
  max_depth: 10 # max depth of pages to visit per website.
  # filter: [] # regexp filter
  rate_limit: "1/5s" # 1 request per 5 sec
  max_body_size: "20MB" # max page body size
  user_agents: # array of user-agents
    - "Spidy/2.1; +https://github.com/twiny/spidy"
  # proxies: [] # array of proxies. http(s), SOCKS5
# Logs
log:
  rotate: 7 # log rotation
  path: "./log" # log directory
# Store
store:
  ttl: "24h" # keep cache for 24h
  path: "./store" # store directory
# Results
result:
  path: ./result # result directory
parralle: 3 # number of concurrent workers
timeout: "5m" # request timeout
tlds: ["biz", "cc", "com", "edu", "info", "net", "org", "tv"] # array of domain extensions to check.
```
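
The `rate_limit` value maps onto the rate and interval passed to `wbot.SetRateLimit` in the API code further down. The actual parsing lives in `spider.ParseSetting`, which is not part of this diff; the sketch below only illustrates, under that assumption, how a string such as `"1/5s"` could be split into a count and a `time.Duration`:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"
)

// parseRateLimit splits a "requests/interval" string such as "1/5s" into a
// numeric rate and a time.Duration. Illustrative only -- not the parser
// used by spider.ParseSetting.
func parseRateLimit(s string) (int, time.Duration, error) {
	parts := strings.SplitN(s, "/", 2)
	if len(parts) != 2 {
		return 0, 0, fmt.Errorf("invalid rate limit %q", s)
	}
	rate, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid rate in %q: %w", s, err)
	}
	interval, err := time.ParseDuration(parts[1])
	if err != nil {
		return 0, 0, fmt.Errorf("invalid interval in %q: %w", s, err)
	}
	return rate, interval, nil
}

func main() {
	rate, interval, err := parseRateLimit("1/5s")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d request(s) per %s\n", rate, interval) // 1 request(s) per 5s
}
```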
## TODO
- [ ] Add support for more `writers` (see the interface sketch below).
- [ ] Add terminal logging.
- [ ] Add test cases.
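
The writer abstraction itself is not shown in this diff. Judging only from how `s.write` is used in the api package below, a new writer would need to satisfy an interface along these lines (names and fields are inferred; the real definitions in `internal/pkg/spider` may differ):

```go
package spider

// Domain mirrors the fields written out by the api package; the real
// struct may carry more fields.
type Domain struct {
	URL    string
	Name   string
	TLD    string
	Status string
}

// Writer is the assumed shape of the interface satisfied by
// writer.NewCSVWriter; an additional writer (JSON, database, ...)
// would implement the same method.
type Writer interface {
	Write(d *Domain) error
}
```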

## Issues

NOTE: This package is provided "as is" with no guarantee. Use it at your own risk, and always test it yourself before using it in a production environment. If you find any issues, please create a new issue.
@@ -0,0 +1,213 @@
package api

import (
	"context"
	_ "embed"
	"fmt"
	"log"
	"net/http"
	"os"
	"os/signal"
	"strconv"
	"sync"
	"syscall"

	//
	"github.com/twiny/spidy/v2/internal/pkg/spider/v1"
	"github.com/twiny/spidy/v2/internal/service/cache"
	"github.com/twiny/spidy/v2/internal/service/writer"

	//
	"github.com/twiny/domaincheck"
	"github.com/twiny/flog"
	"github.com/twiny/wbot"
)

//go:embed version
var Version string

// Spider
type Spider struct {
	wg      *sync.WaitGroup
	setting *spider.Setting
	bot     *wbot.WBot
	pages   chan *spider.Page
	check   *domaincheck.Checker
	store   spider.Storage
	write   spider.Writer
	log     *flog.Logger
}

// NewSpider
func NewSpider(fp string) (*Spider, error) {
	// get settings
	setting := spider.ParseSetting(fp)

	// crawler opts
	opts := []wbot.Option{
		wbot.SetParallel(setting.Parralle),
		wbot.SetMaxDepth(setting.Crawler.MaxDepth),
		wbot.SetRateLimit(setting.Crawler.Limit.Rate, setting.Crawler.Limit.Interval),
		wbot.SetMaxBodySize(setting.Crawler.MaxBodySize),
		wbot.SetUserAgents(setting.Crawler.UserAgents),
		wbot.SetProxies(setting.Crawler.Proxies),
	}

	bot := wbot.NewWBot(opts...)

	check, err := domaincheck.NewChecker()
	if err != nil {
		return nil, err
	}

	// store
	store, err := cache.NewCache(setting.Store.TTL, setting.Store.Path)
	if err != nil {
		return nil, err
	}

	// logger
	log, err := flog.NewLogger(setting.Log.Path, "spidy", setting.Log.Rotate)
	if err != nil {
		return nil, err
	}

	write, err := writer.NewCSVWriter(setting.Result.Path)
	if err != nil {
		return nil, err
	}

	return &Spider{
		wg:      &sync.WaitGroup{},
		setting: setting,
		bot:     bot,
		pages:   make(chan *spider.Page, setting.Parralle),
		check:   check,
		store:   store,
		write:   write,
		log:     log,
	}, nil
}
// Start
func (s *Spider) Start(links []string) error {
	// go crawl
	s.wg.Add(len(links))
	for _, link := range links {
		go func(l string) {
			defer s.wg.Done()
			//
			if err := s.bot.Crawl(l); err != nil {
				s.log.Error(err.Error(), map[string]string{"url": l})
			}
		}(link)
	}

	// check domains
	s.wg.Add(s.setting.Parralle)
	for i := 0; i < s.setting.Parralle; i++ {
		go func() {
			defer s.wg.Done()
			// results
			for res := range s.bot.Stream() {
				// skip non-OK responses
				if res.Status != http.StatusOK {
					s.log.Info("bad HTTP status", map[string]string{
						"url":    res.URL.String(),
						"status": strconv.Itoa(res.Status),
					})
					continue
				}

				// extract domains
				domains := spider.FindDomains(res.Body)

				// check availability
				for _, domain := range domains {
					root := fmt.Sprintf("%s.%s", domain.Name, domain.TLD)

					// check if allowed extension
					if len(s.setting.TLDs) > 0 {
						if ok := s.setting.TLDs[domain.TLD]; !ok {
							s.log.Info("unsupported domain", map[string]string{
								"domain": root,
								"url":    res.URL.String(),
							})
							continue
						}
					}

					// skip if already checked
					if s.store.HasChecked(root) {
						s.log.Info("already checked", map[string]string{
							"domain": root,
							"url":    res.URL.String(),
						})
						continue
					}

					// per-domain timeout; cancel explicitly rather than defer,
					// since deferred cancels would pile up inside this loop
					ctx, cancel := context.WithTimeout(context.Background(), s.setting.Timeout)

					status, err := s.check.Check(ctx, root)
					cancel()
					if err != nil {
						s.log.Error(err.Error(), map[string]string{
							"domain": root,
							"url":    res.URL.String(),
						})
						continue
					}
					// save domain
					if err := s.write.Write(&spider.Domain{
						URL:    res.URL.String(),
						Name:   domain.Name,
						TLD:    domain.TLD,
						Status: status.String(),
					}); err != nil {
						s.log.Error(err.Error(), map[string]string{
							"domain": root,
							"url":    res.URL.String(),
						})
						continue
					}

					// terminal print
					fmt.Printf("[Spidy] == domain: %s - status %s\n", root, status.String())
				}
			}
		}()
	}

	s.wg.Wait()
	return nil
}

// Shutdown
func (s *Spider) Shutdown() error {
	// attempt graceful shutdown
	sigs := make(chan os.Signal, 1)
	signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)

	<-sigs
	log.Println("shutting down ...")

	// 2nd ctrl+c kills program
	go func() {
		sigs := make(chan os.Signal, 1)
		signal.Notify(sigs, syscall.SIGHUP, syscall.SIGINT, syscall.SIGTERM, syscall.SIGQUIT)
		<-sigs
		log.Println("killing program ...")
		os.Exit(0)
	}()

	s.bot.Close()
	s.log.Close()
	if err := s.store.Close(); err != nil {
		return err
	}
	os.Exit(0)
	return nil
}
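
The cmd/spidy/main.go referenced in the README is not part of this excerpt. A minimal sketch of how this API could be wired together is shown below; the import path and flag handling are assumptions based on the README's `-c`/`-u` usage, not the actual implementation:

```go
package main

import (
	"flag"
	"log"

	"github.com/twiny/spidy/v2/internal/app/api" // assumed import path for the api package
)

func main() {
	config := flag.String("c", "config/config.yaml", "path to the config file")
	url := flag.String("u", "", "URL to start crawling from")
	flag.Parse()

	s, err := api.NewSpider(*config)
	if err != nil {
		log.Fatal(err)
	}

	// wait for Ctrl+C in the background while the crawl runs
	go func() {
		if err := s.Shutdown(); err != nil {
			log.Println(err)
		}
	}()

	if err := s.Start([]string{*url}); err != nil {
		log.Fatal(err)
	}
}
```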
@@ -0,0 +1 @@
2.0.0