Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Externalize pypi map into a ro sqlite database #116

Merged
merged 12 commits into from
Jul 24, 2023
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ internal/backends/python/regression_tests/test

# nix build creates a "result" symlink to the nix store output
result
internal/backends/python/pypi_map.sqlite
play
10 changes: 5 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
SOURCES := $(shell find cmd internal -type d -o -name "*.go")
RESOURCES := $(shell find resources)
GENERATED := internal/backends/python/pypi_map.gen.go
GENERATED := internal/backends/python/pypi_map.sqlite
LD_FLAGS := "-X 'github.com/replit/upm/internal/cli.version=$${VERSION:-development version}'"

export GO111MODULE=on
Expand All @@ -11,8 +11,8 @@ upm: cmd/upm/upm ## Build the UPM binary
install: cmd/upm/upm
go install ./cmd/upm

internal/backends/python/pypi_map.gen.go: internal/backends/python/download_stats.json
go generate ./internal/backends/python
internal/backends/python/pypi_map.sqlite: internal/backends/python/download_stats.json
cd internal/backends/python; go run ./gen_pypi_map -bq download_stats.json -pkg python -out pypi_map.sqlite -cache cache -cmd gen

.PHONY: generated
generated: internal/statik/statik.go $(GENERATED)
Expand All @@ -27,7 +27,7 @@ internal/statik/statik.go: $(shell find resources -type f)
go run github.com/rakyll/statik -src resources -dest internal -f

clean-gen:
rm $(GENERATED)
rm -f $(GENERATED)
rm -rf internal/statik

.PHONY: dev
Expand Down Expand Up @@ -70,7 +70,7 @@ pkgbuild: ## Update and test PKGBUILD
/tmp/update-pkgbuild.bash

.PHONY: clean
clean: ## Remove build artifacts
clean: clean-gen ## Remove build artifacts
rm -rf cmd/upm/upm dist

.PHONY: help
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ require (
github.com/googleapis/enterprise-certificate-proxy v0.2.0 // indirect
github.com/googleapis/gax-go/v2 v2.7.0 // indirect
github.com/inconshreveable/mousetrap v1.0.0 // indirect
github.com/mattn/go-sqlite3 v1.14.17 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/spf13/pflag v1.0.3 // indirect
go.opencensus.io v0.24.0 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,8 @@ github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/lyft/protoc-gen-star v0.6.0/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/lyft/protoc-gen-star v0.6.1/go.mod h1:TGAoBVkt8w7MPG72TrKIu85MIdXwDuzJYeZuUPFPNwA=
github.com/magiconair/properties v1.8.0/go.mod h1:PppfXfuXeibc/6YijjN8zIbojt8czPbwD3XqdrwzmxQ=
github.com/mattn/go-sqlite3 v1.14.17 h1:mCRHCLDUBXgpKAqIKsaAaAsrAlbkeomtRFKXh2L6YIM=
github.com/mattn/go-sqlite3 v1.14.17/go.mod h1:2eHXhiwb8IkHr+BDWZGa96P6+rkvnG63S2DGjv9HUNg=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/natefinch/atomic v0.0.0-20150920032501-a62ce929ffcc h1:7xGrl4tTpBQu5Zjll08WupHyq+Sp0Z/adtyf1cfk3Q8=
Expand Down
134 changes: 0 additions & 134 deletions internal/backends/python/gen_pypi_map/code_gen.go

This file was deleted.

165 changes: 165 additions & 0 deletions internal/backends/python/gen_pypi_map/db_gen.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
package main

import (
"database/sql"
"encoding/json"
"errors"
"fmt"
"os"
"strings"

_ "github.com/mattn/go-sqlite3"
)

func GenerateDB(pkg string, outputFilePath string, cache map[string]PackageInfo, bqFilePath string, pkgsLegacyFile string) error {
downloadStats, err := LoadDownloadStats(bqFilePath)
if err != nil {
return err
}

legacyPypiPackages := loadLegacyPypyPackages(pkgsLegacyFile)

packagesProcessed := make(map[string]bool)
var moduleToPackageList = map[string][]PackageInfo{}

for _, info := range cache {
pkgName := strings.ToLower(info.Name)
if info.Error != "" {
// fallback to legacy package module info
legacyInfo, ok := legacyPypiPackages[pkgName]
if ok {
info.Modules = legacyInfo.Mods
}
}
packagesProcessed[pkgName] = true
for _, module := range info.Modules {
moduleToPackageList[module] = append(moduleToPackageList[module], info)
}
}

// Backfill legacy package info that is missing from our cache
for pkg, legacyInfo := range legacyPypiPackages {
_, ok := packagesProcessed[pkg]
if ok {
continue
}
var info PackageInfo
info.Name = legacyInfo.Pkg
info.Modules = legacyInfo.Mods

for _, module := range info.Modules {
moduleToPackageList[module] = append(moduleToPackageList[module], info)
}
}

fmt.Printf("Loaded %d modules\n", len(moduleToPackageList))

err = os.Remove(outputFilePath)
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
db, err := sql.Open("sqlite3", outputFilePath)
if err != nil {
return err
}
_, err = db.Exec(`
create table module_to_pypi_package (module_name text primary key, guess text, reason text);
create table pypi_packages (package_name text primary key, module_list text, downloads int);
create index downloads_index on pypi_packages (downloads);
`)
if err != nil {
return err
}

// Write all data within one transaction for speed
// https://stackoverflow.com/questions/1711631/improve-insert-per-second-performance-of-sqlite
_, err = db.Exec(`begin transaction;`)
if err != nil {
return err
}

// Guess at every module, add the guess and the package that was guessed to
// the masp
airportyh marked this conversation as resolved.
Show resolved Hide resolved
for moduleName, candidates := range moduleToPackageList {
if guess, reason, guessable := GuessPackage(moduleName, candidates, downloadStats); guessable {
stmt, err := db.Prepare("insert into module_to_pypi_package values (?, ?, ?);")
if err != nil {
return err
}
_, err = stmt.Exec(moduleName, guess.Name, reason)
if err != nil {
return err
}
stmt.Close()

stmt, err = db.Prepare(`
insert into pypi_packages values (?, ?, ?)
on conflict (package_name)
do update set
module_list = excluded.module_list,
downloads = excluded.downloads;
`)
if err != nil {
return err
}
download, ok := downloadStats[normalizePackageName(guess.Name)]
if !ok {
download = 0
}
_, err = stmt.Exec(guess.Name, strings.Join(guess.Modules, ","), download)
if err != nil {
return fmt.Errorf("%s on %s", err.Error(), guess.Name)
}
stmt.Close()
}
}

_, err = db.Exec(`end transaction;`)
if err != nil {
return err
}

err = db.Close()
if err != nil {
return err
}

// Make it read only
err = os.Chmod(outputFilePath, 0444)
if err != nil {
return err
}

fmt.Printf("Wrote %s\n", outputFilePath)
return nil

}

// loadLegacyPypyPackages reads a stream of JSON-encoded LegacyPackageInfo
// values from filePath and returns them keyed by lower-cased package name.
//
// Loading is best-effort: if the file cannot be opened an empty map is
// returned; a record whose fields have the wrong JSON type is skipped; and a
// syntax or read error stops loading, returning whatever was decoded so far.
func loadLegacyPypyPackages(filePath string) map[string]LegacyPackageInfo {
	infoMap := make(map[string]LegacyPackageInfo)

	injson, err := os.Open(filePath)
	if err != nil {
		return infoMap
	}
	defer injson.Close()

	dec := json.NewDecoder(injson)
	for dec.More() {
		var info LegacyPackageInfo

		if err := dec.Decode(&info); err != nil {
			var typeErr *json.UnmarshalTypeError
			if errors.As(err, &typeErr) {
				// The offending value has already been consumed, so only
				// this record is lost and decoding can carry on.
				continue
			}
			// Syntax and read errors leave the decoder stuck at the same
			// position: More keeps reporting true while Decode keeps failing,
			// so continuing here would never terminate.
			fmt.Fprintf(os.Stderr, "warning: stopped reading %s: %v\n", filePath, err)
			break
		}
		info.Pkg = strings.ToLower(info.Pkg)
		infoMap[info.Pkg] = info
	}

	return infoMap
}

// normalizePackageName returns name lower-cased with every underscore
// replaced by a hyphen.
func normalizePackageName(name string) string {
	return strings.ReplaceAll(strings.ToLower(name), "_", "-")
}
4 changes: 2 additions & 2 deletions internal/backends/python/gen_pypi_map/gen_pypi_map.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ func main() {
distMods := flag.Bool("dist", false, "Determine modules by examining dists")
force := flag.Bool("force", false, "Force re-test when cached")
pkg := flag.String("pkg", "python", "the pkg name for the output source")
out := flag.String("out", "pypi_map.gen.go", "the destination file for the generated code")
out := flag.String("out", "pypi_map.sqlite", "the destination file for the generated data")
flag.Parse()

if *command == "bq" {
Expand Down Expand Up @@ -73,7 +73,7 @@ func main() {
Parameters: pkg, out, cachedfr, cachefile, bq, pypipackages
*/
cache := LoadAllPackageInfo(*cache, *pkgsFile)
err := GenerateCode(*pkg, *out, cache, *bq, *pkgsLegacyFile)
err := GenerateDB(*pkg, *out, cache, *bq, *pkgsLegacyFile)
if err != nil {
fmt.Fprintf(os.Stderr, "Failed to generate %s: %s", *out, err.Error())
}
Expand Down
Loading