diff --git a/go.mod b/go.mod
index 2858bab0..aff40ddd 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,6 @@ module github.com/internetarchive/Zeno
 go 1.22.4
 
 require (
-	github.com/internetarchive/gocrawlhq v1.2.14
 	github.com/CorentinB/warc v0.8.53
 	github.com/PuerkitoBio/goquery v1.9.3
 	github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
@@ -14,6 +13,7 @@ require (
 	github.com/gosuri/uilive v0.0.4
 	github.com/gosuri/uitable v0.0.4
 	github.com/grafov/m3u8 v0.12.0
+	github.com/internetarchive/gocrawlhq v1.2.14
 	github.com/paulbellamy/ratecounter v0.2.0
 	github.com/philippgille/gokv/leveldb v0.7.0
 	github.com/prometheus/client_golang v1.20.4
@@ -32,6 +32,7 @@ require (
 require (
 	github.com/andybalholm/brotli v1.1.0 // indirect
 	github.com/andybalholm/cascadia v1.3.2 // indirect
+	github.com/aws/aws-sdk-go v1.55.5 // indirect
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/cloudflare/circl v1.4.0 // indirect
@@ -49,6 +50,7 @@ require (
 	github.com/google/go-cmp v0.6.0 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/jmespath/go-jmespath v0.4.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
 	github.com/klauspost/compress v1.17.10 // indirect
 	github.com/magiconair/properties v1.8.7 // indirect
diff --git a/go.sum b/go.sum
index 954b78f1..3ee6b96a 100644
--- a/go.sum
+++ b/go.sum
@@ -16,6 +16,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
 github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
+github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
+github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
 github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
 github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
 github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
@@ -82,6 +84,9 @@ github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGM
 github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
 github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
 github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
+github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
+github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
+github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
 github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
@@ -264,6 +269,7 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
 gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
 gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
+gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/internal/pkg/crawl/capture.go b/internal/pkg/crawl/capture.go
index cfefd00a..0d3be9c4 100644
--- a/internal/pkg/crawl/capture.go
+++ b/internal/pkg/crawl/capture.go
@@ -457,16 +457,25 @@ func (c *Crawl) Capture(item *queue.Item) error {
 	}
 
 	// If the response is an XML document, we want to scrape it for links
+	var outlinks []*url.URL
 	if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
-		URLsFromXML, isSitemap, err := extractor.XML(resp)
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
+		if extractor.IsS3(resp) {
+			URLsFromS3, err := extractor.S3(resp)
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting URLs from S3")
+			}
+
+			outlinks = append(outlinks, URLsFromS3...)
 		} else {
-			if isSitemap {
-				waitGroup.Add(1)
-				go c.queueOutlinks(URLsFromXML, item, &waitGroup)
+			URLsFromXML, isSitemap, err := extractor.XML(resp)
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
 			} else {
-				assets = append(assets, URLsFromXML...)
+				if isSitemap {
+					outlinks = append(outlinks, URLsFromXML...)
+				} else {
+					assets = append(assets, URLsFromXML...)
+				}
 			}
 		}
 	} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
@@ -488,111 +497,106 @@ func (c *Crawl) Capture(item *queue.Item) error {
 		}
 
 		return err
-	}
-
-	// Turn the response into a doc that we will scrape for outlinks and assets.
-	doc, err := goquery.NewDocumentFromReader(resp.Body)
-	if err != nil {
-		c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
-		return err
-	}
-
-	// Execute site-specific code on the document
-	if cloudflarestream.IsURL(base.Host) {
-		// Look for JS files necessary for the playback of the video
-		cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
-			return err
-		}
-		// Seencheck the URLs we captured, we ignore the returned value here
-		// because we already archived the URLs, we just want them to be added
-		// to the seencheck table.
-		if c.UseSeencheck {
-			if c.UseHQ {
-				_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
-				if err != nil {
-					c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
-						"urls": cfstreamURLs,
-					})).Error("error while seenchecking assets via HQ")
-				}
-			} else {
-				for _, cfstreamURL := range cfstreamURLs {
-					c.Seencheck.SeencheckURL(cfstreamURL, "asset")
-				}
-			}
-		}
-
-		// Log the archived URLs
-		for _, cfstreamURL := range cfstreamURLs {
-			c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
-				"parentHop": item.Hop,
-				"parentUrl": utils.URLToString(item.URL),
-				"type":      "asset",
-			})).Info("URL archived")
-		}
-	} else if ina.IsURL(req) {
-		playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
-
-		for _, playerURL := range playerURLs {
-			playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
-			if err != nil {
-				c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
-			} else {
-				c.Capture(playerItem)
-			}
-		}
-	}
-
-	// Websites can use a tag to specify a base for relative URLs in every other tags.
-	// This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
-	// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
-	if !utils.StringInSlice("base", c.DisabledHTMLTags) {
-		oldBase := base
-
-		doc.Find("base").Each(func(index int, goitem *goquery.Selection) {
-			// If a new base got scraped, stop looking for one
-			if oldBase != base {
-				return
-			}
-
-			// Attempt to get a new base value from the base HTML tag
-			link, exists := goitem.Attr("href")
-			if exists {
-				baseTagValue, err := url.Parse(link)
-				if err != nil {
-					c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base tag value")
-				} else {
-					base = baseTagValue
-				}
-			}
-		})
-	}
-
-	// Extract outlinks
-	outlinks, err := c.extractOutlinks(base, doc)
-	if err != nil {
-		c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
-		return err
-	}
-
-	waitGroup.Add(1)
-	go c.queueOutlinks(outlinks, item, &waitGroup)
-
-	if c.DisableAssetsCapture {
-		return err
-	}
-
-	// Extract and capture assets (only if we didn't use an extractor that produces assets)
-	if len(assets) == 0 {
-		assets, err = c.extractAssets(base, item, doc)
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
-			return err
-		}
-	}
-
-	if len(assets) != 0 {
+	} else {
+		// Turn the response into a doc that we will scrape for outlinks and assets.
+		doc, err := goquery.NewDocumentFromReader(resp.Body)
+		if err != nil {
+			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
+			return err
+		}
+
+		// Execute site-specific code on the document
+		if cloudflarestream.IsURL(utils.URLToString(item.URL)) {
+			// Look for JS files necessary for the playback of the video
+			cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
+				return err
+			}
+
+			// Seencheck the URLs we captured. We ignore the returned value here
+			// because we already archived the URLs; we just want them to be added
+			// to the seencheck table.
+			if c.UseSeencheck {
+				if c.UseHQ {
+					_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
+					if err != nil {
+						c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
+							"urls": cfstreamURLs,
+						})).Error("error while seenchecking assets via HQ")
+					}
+				} else {
+					for _, cfstreamURL := range cfstreamURLs {
+						c.Seencheck.SeencheckURL(cfstreamURL, "asset")
+					}
+				}
+			}
+			// Log the archived URLs
+			for _, cfstreamURL := range cfstreamURLs {
+				c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
+					"parentHop": item.Hop,
+					"parentUrl": utils.URLToString(item.URL),
+					"type":      "asset",
+				})).Info("URL archived")
+			}
+		} else if ina.IsURL(req) {
+			playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
+
+			for _, playerURL := range playerURLs {
+				playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
+				if err != nil {
+					c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
+				} else {
+					c.Capture(playerItem)
+				}
+			}
+		}
+
+		// Websites can use a tag to specify a base for relative URLs in all other tags.
+		// This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
+		// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
+		if !utils.StringInSlice("base", c.DisabledHTMLTags) {
+			oldBase := base
+
+			doc.Find("base").Each(func(index int, goitem *goquery.Selection) {
+				// If a new base got scraped, stop looking for one
+				if oldBase != base {
+					return
+				}
+
+				// Attempt to get a new base value from the base HTML tag
+				link, exists := goitem.Attr("href")
+				if exists {
+					baseTagValue, err := url.Parse(link)
+					if err != nil {
+						c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base tag value")
+					} else {
+						base = baseTagValue
+					}
+				}
+			})
+		}
+
+		// Extract outlinks
+		outlinks, err = c.extractOutlinks(base, doc)
+		if err != nil {
+			c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
+			return err
+		}
+
+		if !c.DisableAssetsCapture {
+			assets, err = c.extractAssets(base, item, doc)
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
+				return err
+			}
+		}
+	}
+
+	waitGroup.Add(1)
+	go c.queueOutlinks(outlinks, item, &waitGroup)
+
+	if !c.DisableAssetsCapture && len(assets) != 0 {
 		assets = c.seencheckAssets(assets, item)
 		if len(assets) != 0 {
 			c.captureAssets(item, assets, resp.Cookies(), nil)
diff --git a/internal/pkg/crawl/extractor/s3.go b/internal/pkg/crawl/extractor/s3.go
new file mode 100644
index 00000000..1df9fa3f
--- /dev/null
+++ b/internal/pkg/crawl/extractor/s3.go
@@ -0,0 +1,124 @@
+package extractor
+
+import (
+	"encoding/xml"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+
+	"github.com/internetarchive/Zeno/internal/pkg/utils"
+)
+
+var validS3Servers = []string{
+	"AmazonS3",
+	"WasabiS3",
+	"UploadServer", // Google Cloud Storage
+	"Windows-Azure-Blob",
+	"AliyunOSS", // Alibaba Object Storage Service
+}
+
+// S3ListBucketResult represents the XML structure of an S3 bucket listing
+type S3ListBucketResult struct {
+	XMLName               xml.Name       `xml:"ListBucketResult"`
+	Name                  string         `xml:"Name"`
+	Prefix                string         `xml:"Prefix"`
+	Marker                string         `xml:"Marker"`
+	Contents              []S3Object     `xml:"Contents"`
+	CommonPrefixes        []CommonPrefix `xml:"CommonPrefixes"`
+	IsTruncated           bool           `xml:"IsTruncated"`
+	NextContinuationToken string         `xml:"NextContinuationToken"`
+}
+
+type S3Object struct {
+	Key          string `xml:"Key"`
+	LastModified string `xml:"LastModified"`
+	Size         int64  `xml:"Size"`
+}
+
+type CommonPrefix struct {
+	Prefix string `xml:"Prefix"`
+}
+
+// IsS3 checks if the response is from an S3 server
+func IsS3(resp *http.Response) bool {
+	return utils.StringContainsSliceElements(resp.Header.Get("Server"), validS3Servers)
+}
+
+// S3 takes an initial response and returns URLs of either files or prefixes at
+// the current level, plus a continuation URL if more results exist
+func S3(resp *http.Response) ([]*url.URL, error) {
+	result, err := S3ProcessResponse(resp)
+	if err != nil {
+		return nil, err
+	}
+
+	// Extract the base URL from the response URL
+	reqURL := resp.Request.URL
+	requestQuery := reqURL.Query()
+	baseURL := fmt.Sprintf("https://%s", reqURL.Host)
+	parsedBase, err := url.Parse(baseURL)
+	if err != nil {
+		return nil, fmt.Errorf("invalid base URL: %v", err)
+	}
+
+	var urls []string
+
+	// ListObjects (V1): if the request does not use list-type=2, we can page
+	// through the bucket by re-requesting with a marker parameter
+	if requestQuery.Get("list-type") != "2" && len(result.Contents) > 0 {
+		// Request the next page, starting after the last key we received
+		nextURL := *reqURL
+		q := nextURL.Query()
+		q.Set("marker", result.Contents[len(result.Contents)-1].Key)
+		nextURL.RawQuery = q.Encode()
+		urls = append(urls, nextURL.String())
+	}
+
+	// If the listing is grouped (delimited), descend into each common prefix
+	if len(result.CommonPrefixes) > 0 {
+		for _, prefix := range result.CommonPrefixes {
+			nextURL := *reqURL
+			q := nextURL.Query()
+			q.Set("prefix", prefix.Prefix)
+			nextURL.RawQuery = q.Encode()
+			urls = append(urls, nextURL.String())
+		}
+	} else {
+		// Otherwise return the URLs of the non-empty files at this level
+		for _, obj := range result.Contents {
+			if obj.Size > 0 {
+				fileURL := *parsedBase
+				fileURL.Path += "/" + obj.Key
+				urls = append(urls, fileURL.String())
+			}
+		}
+	}
+
+	// If there is a continuation token (ListObjectsV2), add the continuation URL
+	if result.IsTruncated && result.NextContinuationToken != "" {
+		nextURL := *reqURL
+		q := nextURL.Query()
+		q.Set("continuation-token", result.NextContinuationToken)
+		nextURL.RawQuery = q.Encode()
+		urls = append(urls, nextURL.String())
+	}
+
+	return utils.StringSliceToURLSlice(urls), nil
+}
+
+// S3ProcessResponse parses an HTTP response into an S3ListBucketResult
+func S3ProcessResponse(resp *http.Response) (*S3ListBucketResult, error) {
+	body, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("error reading response body: %v", err)
+	}
+	defer resp.Body.Close()
+
+	var result S3ListBucketResult
+	if err := xml.Unmarshal(body, &result); err != nil {
+		return nil, fmt.Errorf("error parsing XML: %v", err)
+	}
+
+	return &result, nil
+}
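diff --git a/internal/pkg/crawl/extractor/s3_test.go b/internal/pkg/crawl/extractor/s3_test.go
new file mode 100644
--- /dev/null
+++ b/internal/pkg/crawl/extractor/s3_test.go
@@ -0,0 +1,53 @@
+// Illustrative sketch, not part of the original change: a minimal test that
+// exercises IsS3 and S3 end to end. The bucket name, object key, and XML
+// fixture below are made-up assumptions for demonstration; only IsS3, S3,
+// and the validS3Servers behaviour they exercise come from s3.go above.
+package extractor
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"strings"
+	"testing"
+)
+
+func TestS3FileListing(t *testing.T) {
+	// A minimal ListObjectsV2-style listing with a single non-empty object.
+	const body = `<?xml version="1.0" encoding="UTF-8"?>
+<ListBucketResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/">
+	<Name>example-bucket</Name>
+	<Contents>
+		<Key>data/file1.txt</Key>
+		<LastModified>2024-01-01T00:00:00.000Z</LastModified>
+		<Size>42</Size>
+	</Contents>
+	<IsTruncated>false</IsTruncated>
+</ListBucketResult>`
+
+	// Serve the fixture with a Server header that IsS3 recognizes.
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.Header().Set("Server", "AmazonS3")
+		w.Write([]byte(body))
+	}))
+	defer server.Close()
+
+	resp, err := http.Get(server.URL + "/?list-type=2")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer resp.Body.Close()
+
+	if !IsS3(resp) {
+		t.Fatal("expected IsS3 to recognize the AmazonS3 Server header")
+	}
+
+	// With list-type=2, no CommonPrefixes and IsTruncated=false, S3 should
+	// return exactly one URL: the file URL built from the object key.
+	urls, err := S3(resp)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(urls) != 1 || !strings.HasSuffix(urls[0].String(), "/data/file1.txt") {
+		t.Fatalf("unexpected URLs: %v", urls)
+	}
+}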