diff --git a/go.mod b/go.mod
index 2858bab0..aff40ddd 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,6 @@ module github.com/internetarchive/Zeno
go 1.22.4
require (
- github.com/internetarchive/gocrawlhq v1.2.14
github.com/CorentinB/warc v0.8.53
github.com/PuerkitoBio/goquery v1.9.3
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
@@ -14,6 +13,7 @@ require (
github.com/gosuri/uilive v0.0.4
github.com/gosuri/uitable v0.0.4
github.com/grafov/m3u8 v0.12.0
+ github.com/internetarchive/gocrawlhq v1.2.14
github.com/paulbellamy/ratecounter v0.2.0
github.com/philippgille/gokv/leveldb v0.7.0
github.com/prometheus/client_golang v1.20.4
@@ -32,6 +32,7 @@ require (
require (
github.com/andybalholm/brotli v1.1.0 // indirect
github.com/andybalholm/cascadia v1.3.2 // indirect
+ github.com/aws/aws-sdk-go v1.55.5 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/cloudflare/circl v1.4.0 // indirect
@@ -49,6 +50,7 @@ require (
github.com/google/go-cmp v0.6.0 // indirect
github.com/hashicorp/hcl v1.0.0 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
+ github.com/jmespath/go-jmespath v0.4.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.10 // indirect
github.com/magiconair/properties v1.8.7 // indirect
diff --git a/go.sum b/go.sum
index 954b78f1..3ee6b96a 100644
--- a/go.sum
+++ b/go.sum
@@ -16,6 +16,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
+github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
+github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
@@ -82,6 +84,9 @@ github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGM
github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
+github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
+github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
@@ -264,6 +269,7 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index cfefd00a..0d3be9c4 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -457,16 +457,25 @@ func (c *Crawl) Capture(item *queue.Item) error {
}
// If the response is an XML document, we want to scrape it for links
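+ // outlinks gathers URLs discovered by every extractor below; they are queued once after extraction.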
+ var outlinks []*url.URL
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
- URLsFromXML, isSitemap, err := extractor.XML(resp)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
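+ // S3 bucket listings are served as XML, so check for S3 before falling back to the generic XML extractor.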
+ if extractor.IsS3(resp) {
+ URLsFromS3, err := extractor.S3(resp)
+ if err != nil {
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting URLs from S3")
+ }
+
+ outlinks = append(outlinks, URLsFromS3...)
} else {
- if isSitemap {
- waitGroup.Add(1)
- go c.queueOutlinks(URLsFromXML, item, &waitGroup)
+ URLsFromXML, isSitemap, err := extractor.XML(resp)
+ if err != nil {
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
} else {
- assets = append(assets, URLsFromXML...)
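+ // Sitemap entries are treated as outlinks and queued together with the rest, below.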
+ if isSitemap {
+ outlinks = append(outlinks, URLsFromXML...)
+ } else {
+ assets = append(assets, URLsFromXML...)
+ }
}
}
} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
@@ -488,111 +497,106 @@ func (c *Crawl) Capture(item *queue.Item) error {
}
return err
- }
-
- // Turn the response into a doc that we will scrape for outlinks and assets.
- doc, err := goquery.NewDocumentFromReader(resp.Body)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
- return err
- }
-
- // Execute site-specific code on the document
- if cloudflarestream.IsURL(base.Host) {
- // Look for JS files necessary for the playback of the video
- cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
+ } else {
+ // Turn the response into a doc that we will scrape for outlinks and assets.
+ doc, err := goquery.NewDocumentFromReader(resp.Body)
if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
return err
}
- // Seencheck the URLs we captured, we ignore the returned value here
- // because we already archived the URLs, we just want them to be added
- // to the seencheck table.
- if c.UseSeencheck {
- if c.UseHQ {
- _, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
- "urls": cfstreamURLs,
- })).Error("error while seenchecking assets via HQ")
- }
- } else {
- for _, cfstreamURL := range cfstreamURLs {
- c.Seencheck.SeencheckURL(cfstreamURL, "asset")
- }
- }
- }
- // Log the archived URLs
- for _, cfstreamURL := range cfstreamURLs {
- c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
- "parentHop": item.Hop,
- "parentUrl": utils.URLToString(item.URL),
- "type": "asset",
- })).Info("URL archived")
- }
- } else if ina.IsURL(req) {
- playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
-
- for _, playerURL := range playerURLs {
- playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
+ // Execute site-specific code on the document
+ if cloudflarestream.IsURL(utils.URLToString(item.URL)) {
+ // Look for JS files necessary for the playback of the video
+ cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
- } else {
- c.Capture(playerItem)
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
+ return err
}
- }
- }
-
- // Websites can use a tag to specify a base for relative URLs in every other tags.
- // This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
- // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
- if !utils.StringInSlice("base", c.DisabledHTMLTags) {
- oldBase := base
- doc.Find("base").Each(func(index int, goitem *goquery.Selection) {
- // If a new base got scraped, stop looking for one
- if oldBase != base {
- return
+ // Seencheck the URLs we captured. We ignore the returned value here
+ // because we already archived the URLs; we just want them to be added
+ // to the seencheck table.
+ if c.UseSeencheck {
+ if c.UseHQ {
+ _, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
+ if err != nil {
+ c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
+ "urls": cfstreamURLs,
+ })).Error("error while seenchecking assets via HQ")
+ }
+ } else {
+ for _, cfstreamURL := range cfstreamURLs {
+ c.Seencheck.SeencheckURL(cfstreamURL, "asset")
+ }
+ }
}
+ // Log the archived URLs
+ for _, cfstreamURL := range cfstreamURLs {
+ c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
+ "parentHop": item.Hop,
+ "parentUrl": utils.URLToString(item.URL),
+ "type": "asset",
+ })).Info("URL archived")
+ }
+ } else if ina.IsURL(req) {
+ playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
- // Attempt to get a new base value from the base HTML tag
- link, exists := goitem.Attr("href")
- if exists {
- baseTagValue, err := url.Parse(link)
+ for _, playerURL := range playerURLs {
+ playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base tag value")
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
} else {
- base = baseTagValue
+ c.Capture(playerItem)
}
}
- })
- }
+ }
- // Extract outlinks
- outlinks, err := c.extractOutlinks(base, doc)
- if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
- return err
- }
+ // Websites can use a tag to specify a base URL for relative URLs in all other tags.
+ // This checks for the "base" tag and resets the base URL variable with the new base URL specified.
+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
+ if !utils.StringInSlice("base", c.DisabledHTMLTags) {
+ oldBase := base
- waitGroup.Add(1)
- go c.queueOutlinks(outlinks, item, &waitGroup)
+ doc.Find("base").Each(func(index int, goitem *goquery.Selection) {
+ // If a new base got scraped, stop looking for one
+ if oldBase != base {
+ return
+ }
- if c.DisableAssetsCapture {
- return err
- }
+ // Attempt to get a new base value from the base HTML tag
+ link, exists := goitem.Attr("href")
+ if exists {
+ baseTagValue, err := url.Parse(link)
+ if err != nil {
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base tag value")
+ } else {
+ base = baseTagValue
+ }
+ }
+ })
+ }
- // Extract and capture assets (only if we didn't use an extractor that produce assets)
- if len(assets) == 0 {
- assets, err = c.extractAssets(base, item, doc)
+ // Extract outlinks
+ outlinks, err = c.extractOutlinks(base, doc)
if err != nil {
- c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
return err
}
+
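+ // Extract assets from the document, unless assets capture is disabled.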
+ if !c.DisableAssetsCapture {
+ assets, err = c.extractAssets(base, item, doc)
+ if err != nil {
+ c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
+ return err
+ }
+ }
}
- if len(assets) != 0 {
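+ // Outlinks gathered from either the XML/S3 path or the HTML path are queued in one place.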
+ waitGroup.Add(1)
+ go c.queueOutlinks(outlinks, item, &waitGroup)
+
+ if !c.DisableAssetsCapture && len(assets) != 0 {
assets = c.seencheckAssets(assets, item)
if len(assets) != 0 {
c.captureAssets(item, assets, resp.Cookies(), nil)
diff --git a/internal/pkg/crawl/extractor/s3.go b/internal/pkg/crawl/extractor/s3.go
new file mode 100644
index 00000000..1df9fa3f
--- /dev/null
+++ b/internal/pkg/crawl/extractor/s3.go
@@ -0,0 +1,124 @@
+package extractor
+
+import (
+ "encoding/xml"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+
+ "github.com/internetarchive/Zeno/internal/pkg/utils"
+)
+
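+// validS3Servers lists Server header values sent by S3 and S3-compatible storage services.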
+var validS3Servers = []string{
+ "AmazonS3",
+ "WasabiS3",
+ "UploadServer", // Google Cloud Storage
+ "Windows-Azure-Blob",
+ "AliyunOSS", // Alibaba Object Storage Service
+}
+
+// S3ListBucketResult represents the XML structure of an S3 bucket listing, covering both ListObjects (V1) and ListObjectsV2 fields.
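+//
+// A typical listing body looks like this (fields abbreviated):
+//
+//  <ListBucketResult>
+//    <Name>bucket</Name>
+//    <Contents><Key>dir/file.bin</Key><Size>1024</Size></Contents>
+//    <CommonPrefixes><Prefix>dir/</Prefix></CommonPrefixes>
+//    <IsTruncated>true</IsTruncated>
+//    <NextContinuationToken>token</NextContinuationToken>
+//  </ListBucketResult>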
+type S3ListBucketResult struct {
+ XMLName xml.Name `xml:"ListBucketResult"`
+ Name string `xml:"Name"`
+ Prefix string `xml:"Prefix"`
+ Marker string `xml:"Marker"`
+ Contents []S3Object `xml:"Contents"`
+ CommonPrefixes []CommonPrefix `xml:"CommonPrefixes"`
+ IsTruncated bool `xml:"IsTruncated"`
+ NextContinuationToken string `xml:"NextContinuationToken"`
+}
+
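+// S3Object describes a single object entry in a bucket listing.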
+type S3Object struct {
+ Key string `xml:"Key"`
+ LastModified string `xml:"LastModified"`
+ Size int64 `xml:"Size"`
+}
+
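+// CommonPrefix represents a group of keys sharing a common prefix, returned when a delimiter is used.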
+type CommonPrefix struct {
+ Prefix string `xml:"Prefix"`
+}
+
+// IsS3 reports whether the response comes from S3 or an S3-compatible server, based on its Server header.
+func IsS3(resp *http.Response) bool {
+ return utils.StringContainsSliceElements(resp.Header.Get("Server"), validS3Servers)
+}
+
+// S3 takes a bucket listing response and returns URLs of either files or prefixes
+// at the current level, plus a continuation URL if more results exist.
+func S3(resp *http.Response) ([]*url.URL, error) {
+ result, err := S3ProcessResponse(resp)
+ if err != nil {
+ return nil, err
+ }
+
+ // Extract base URL from the response URL
+ reqURL := resp.Request.URL
+ requestQuery := reqURL.Query()
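+ // Note: the base URL is rebuilt with HTTPS regardless of the scheme of the original request.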
+ baseURL := fmt.Sprintf("https://%s", reqURL.Host)
+ parsedBase, err := url.Parse(baseURL)
+ if err != nil {
+ return nil, fmt.Errorf("invalid base URL: %v", err)
+ }
+
+ var urls []string
+
+ // ListObjects (V1): when the request does not use list-type=2 (ListObjectsV2) and
+ // objects were returned, paginate by setting the marker parameter to the last key.
+ if requestQuery.Get("list-type") != "2" && len(result.Contents) > 0 {
+ // If we can, iterate through S3 using the marker field
+ nextURL := *reqURL
+ q := nextURL.Query()
+ q.Set("marker", result.Contents[len(result.Contents)-1].Key)
+ nextURL.RawQuery = q.Encode()
+ urls = append(urls, nextURL.String())
+ }
+
+ // If the listing returned common prefixes (a delimiter was used), queue each prefix for further listing
+ if len(result.CommonPrefixes) > 0 {
+ for _, prefix := range result.CommonPrefixes {
+ nextURL := *reqURL
+ q := nextURL.Query()
+ q.Set("prefix", prefix.Prefix)
+ nextURL.RawQuery = q.Encode()
+ urls = append(urls, nextURL.String())
+ }
+ } else {
+ // Otherwise return file URLs
+ for _, obj := range result.Contents {
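+ // Zero-byte keys, commonly used as directory placeholders, are skipped.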
+ if obj.Size > 0 {
+ fileURL := *parsedBase
+ fileURL.Path += "/" + obj.Key
+ urls = append(urls, fileURL.String())
+ }
+ }
+ }
+
+ // If there's a continuation token, add the continuation URL
+ if result.IsTruncated && result.NextContinuationToken != "" {
+ nextURL := *reqURL
+ q := nextURL.Query()
+ q.Set("continuation-token", result.NextContinuationToken)
+ nextURL.RawQuery = q.Encode()
+ urls = append(urls, nextURL.String())
+ }
+
+ return utils.StringSliceToURLSlice(urls), nil
+}
+
+// S3ProcessResponse parses an HTTP response into an S3ListBucketResult.
+// It consumes and closes the response body, even when reading fails.
+func S3ProcessResponse(resp *http.Response) (*S3ListBucketResult, error) {
+ defer resp.Body.Close()
+
+ body, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return nil, fmt.Errorf("error reading response body: %v", err)
+ }
+
+ var result S3ListBucketResult
+ if err := xml.Unmarshal(body, &result); err != nil {
+ return nil, fmt.Errorf("error parsing XML: %v", err)
+ }
+
+ return &result, nil
+}