This repository has been archived by the owner on Mar 8, 2024. It is now read-only.
This is sqlite dump of the artifacts/
directory. You will find a table with two columns: the JSON path, and the JSON blob. You can use the json_*
sqlite functions to query data.
The file is uploaded in three parts that need to be recombined once downloaded. The result should be a .tar.zst
file, that needs to be extracted:
$ cat libcfgraph.tar.zst.1 libcfgraph.tar.zst.2 libcfgraph.tar.zst.3 > libcfgraph.tar.zst
$ tar -xf libcfgraph.tar.zst
Python code used to generate the sqlite
dump
The
query()
function shows how to fetch data from the database.
"""
Bootstrap an artifact-to-json-blob sqlite database
using regro/libcfgraph's JSON artifacts.
The script expects the path to the artifacts/ directory as an argument.
This is a one-time operation that takes around 10 minutes.
"""
import sqlite3
import sys
import time
from itertools import batched
from pathlib import Path
from tqdm.auto import tqdm
def create_db():
db = sqlite3.connect("libcfgraph.db", isolation_level=None)
db.executescript(
"""
CREATE TABLE IF NOT EXISTS ArtifactToJson (
artifact TEXT PRIMARY KEY,
data TEXT
);
PRAGMA journal_mode = OFF;
PRAGMA synchronous = 0;
PRAGMA cache_size = 1000000;
PRAGMA locking_mode = EXCLUSIVE;
PRAGMA temp_store = MEMORY;
"""
)
return db
def bootstrap_from_libcfgraph_artifact_to_json(db, artifacts_dir):
def iterator():
for path in Path(artifacts_dir).glob("**/*.json"):
try:
text = path.read_bytes()
except Exception as e:
print(f"Error reading {path}: {e}")
continue
artifact = "/".join([*path.parts[-3:-1], path.stem])
yield (artifact, text)
db = create_db()
for batch in tqdm(batched(iterator(), 10000), total=1602023 // 10000):
db.executemany(
"""
INSERT OR IGNORE INTO ArtifactToJson (artifact, data)
VALUES (?, json(?))
""",
batch,
)
return db
def query(db, q):
t0 = time.time()
for row in db.execute(
"""
SELECT artifact
FROM ArtifactToJson, json_each(data, "$.files")
WHERE json_each.value = (?)
LIMIT 10
""",
(q,),
):
print("-", row)
print(f"Query took {time.time() - t0:.2f} seconds")
if __name__ == "__main__":
db = create_db()
bootstrap_from_libcfgraph_artifact_to_json(db, sys.argv[1])
db.commit()
db.close()