Skip to content

Commit

Permalink
Tokenize all strings in the metadata document, and only strings.
Browse files Browse the repository at this point in the history
This greatly simplifies the API and user expectations, by eliminating
the need to manually specify the tokenizable fields. It is also less
surprising as the search for stringy fields now behaves consistently.
  • Loading branch information
LTLA committed Feb 20, 2024
1 parent f45cb57 commit 3fceb66
Show file tree
Hide file tree
Showing 19 changed files with 94 additions and 263 deletions.
54 changes: 0 additions & 54 deletions scripts/configure.js

This file was deleted.

4 changes: 2 additions & 2 deletions scripts/fresh.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ const args = parseArgs({
});

const dir = utils.required(args, "dir");
const { db_paths, db_tokenizable } = utils.parseConfigurations(utils.required(args, "config"), dir);
const db_paths = utils.parseConfigurations(utils.required(args, "config"), dir);
const { list_projects, list_assets, list_versions, find_latest, read_summary, read_metadata } = utils.chooseSourceFunctions(utils.optional(args, "registry"), utils.optional(args, "gypsum"));

// Creating the timestamp here, just so that if there are any operations
Expand All @@ -34,4 +34,4 @@ const { list_projects, list_assets, list_versions, find_latest, read_summary, re
// just (re)aligning with whatever's in the bucket.
fs.writeFileSync(path.join(dir, "modified"), String((new Date).getTime()))

await freshHandler(db_paths, list_projects, list_assets, list_versions, find_latest, read_summary, read_metadata, db_tokenizable);
await freshHandler(db_paths, list_projects, list_assets, list_versions, find_latest, read_summary, read_metadata);
3 changes: 1 addition & 2 deletions scripts/manual.js
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ const args = parseArgs({
}
});

const { db_paths, db_tokenizable } = utils.parseConfigurations(utils.required(args, "config"), utils.required(args, "dir"));
const db_paths = utils.parseConfigurations(utils.required(args, "config"), utils.required(args, "dir"));
const { list_projects, list_assets, list_versions, find_latest, read_summary, read_metadata } = utils.chooseSourceFunctions(utils.optional(args, "registry"), utils.optional(args, "gypsum"));

await manualHandler(
Expand All @@ -46,5 +46,4 @@ await manualHandler(
find_latest,
read_summary,
read_metadata,
db_tokenizable
);
4 changes: 2 additions & 2 deletions scripts/update.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ const args = parseArgs({
});

const dir = utils.required(args, "dir");
const { db_paths, db_tokenizable } = utils.parseConfigurations(utils.required(args, "config"), dir);
const db_paths = utils.parseConfigurations(utils.required(args, "config"), dir);
const { list_logs, read_log, read_metadata, find_latest } = utils.chooseSourceFunctions(utils.optional(args, "registry"), utils.optional(args, "gypsum"));

let lastmod_path = path.join(dir, "modified");
let lastmod = new Date(Number(fs.readFileSync(lastmod_path)));
let all_logs = await updateHandler(db_paths, lastmod, list_logs, read_log, read_metadata, find_latest, db_tokenizable);
let all_logs = await updateHandler(db_paths, lastmod, list_logs, read_log, read_metadata, find_latest);

// Storing the timestamp of the last processed job.
if (all_logs.length) {
Expand Down
8 changes: 3 additions & 5 deletions scripts/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,11 @@ import * as path from "path";

export function parseConfigurations(configs, dir) {
const db_paths = {};
const db_tokenizable = {};
for (const cpath of configs) {
let config = JSON.parse(fs.readFileSync(cpath, { encoding: "utf8" }));
db_paths[config.file_name] = path.join(dir, config.db_name);
db_tokenizable[config.file_name] = new Set(config.tokenizable);
let i = cpath.indexOf(",");
db_paths[cpath.slice(0, i)] = path.join(dir, cpath.slice(i + 1));
}
return { db_paths, db_tokenizable };
return db_paths;
}

export function chooseSourceFunctions(registry, gypsum_url) {
Expand Down
16 changes: 8 additions & 8 deletions src/handlers/freshHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { addVersion } from "../sqlite/addVersion.js";
import { createTables } from "../sqlite/createTables.js";
import Database from "better-sqlite3"

export async function freshHandler(db_paths, list_projects, list_assets, list_versions, find_latest, read_summary, read_metadata, db_tokenizable) {
export async function freshHandler(db_paths, list_projects, list_assets, list_versions, find_latest, read_summary, read_metadata) {
const db_handles = {};
for (const [k, v] of Object.entries(db_paths)) {
if (fs.existsSync(v)) {
Expand All @@ -17,7 +17,7 @@ export async function freshHandler(db_paths, list_projects, list_assets, list_ve
const all_projects = await list_projects();
let all_outcomes = [];
for (const project of all_projects) {
let projprom = internal_freshProject(db_handles, project, list_assets, list_versions, find_latest, read_summary, read_metadata, db_tokenizable);
let projprom = internal_freshProject(db_handles, project, list_assets, list_versions, find_latest, read_summary, read_metadata);
all_outcomes.push(projprom);
}

Expand All @@ -32,11 +32,11 @@ export async function freshHandler(db_paths, list_projects, list_assets, list_ve
}

// Only exported for the purpose of re-use in manualHandler.js.
export async function internal_freshProject(db_handles, project, list_assets, list_versions, find_latest, read_summary, read_metadata, db_tokenizable) {
export async function internal_freshProject(db_handles, project, list_assets, list_versions, find_latest, read_summary, read_metadata) {
const all_assets = await list_assets(project);
let all_outcomes = [];
for (const asset of all_assets) {
let assprom = internal_freshAsset(db_handles, project, asset, list_versions, find_latest, read_summary, read_metadata, db_tokenizable);
let assprom = internal_freshAsset(db_handles, project, asset, list_versions, find_latest, read_summary, read_metadata);
all_outcomes.push(assprom);
}

Expand All @@ -49,7 +49,7 @@ export async function internal_freshProject(db_handles, project, list_assets, li
}
}

export async function internal_freshAsset(db_handles, project, asset, list_versions, find_latest, read_summary, read_metadata, db_tokenizable) {
export async function internal_freshAsset(db_handles, project, asset, list_versions, find_latest, read_summary, read_metadata) {
const latest = await find_latest(project, asset);
if (latest == null) { // short-circuit if latest=null, as that means that there are no non-probational versions.
return;
Expand All @@ -58,7 +58,7 @@ export async function internal_freshAsset(db_handles, project, asset, list_versi
const all_versions = await list_versions(project, asset);
let all_outcomes = [];
for (const version of all_versions) {
let verprom = internal_freshVersion(db_handles, project, asset, version, latest, read_summary, read_metadata, db_tokenizable);
let verprom = internal_freshVersion(db_handles, project, asset, version, latest, read_summary, read_metadata);
all_outcomes.push(verprom);
}

Expand All @@ -71,15 +71,15 @@ export async function internal_freshAsset(db_handles, project, asset, list_versi
}
}

export async function internal_freshVersion(db_handles, project, asset, version, latest, read_summary, read_metadata, db_tokenizable) {
export async function internal_freshVersion(db_handles, project, asset, version, latest, read_summary, read_metadata) {
const summ = await read_summary(project, asset, version);
if ("on_probation" in summ && summ.on_probation) {
return;
}
const output = await read_metadata(project, asset, version, Object.keys(db_handles));
for (const [e, db] of Object.entries(db_handles)) {
try {
addVersion(db, project, asset, version, (latest == version), output[e], db_tokenizable[e]);
addVersion(db, project, asset, version, (latest == version), output[e]);
} catch (err) {
throw new Error("failed to add to database '" + e + "'", { cause: err });
}
Expand Down
8 changes: 4 additions & 4 deletions src/handlers/manualHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import { deleteProject } from "../sqlite/deleteProject.js";
import * as fresh from "./freshHandler.js";
import Database from "better-sqlite3"

export async function manualHandler(db_paths, project, asset, version, list_assets, list_versions, find_latest, read_summary, read_metadata, db_tokenizable) {
export async function manualHandler(db_paths, project, asset, version, list_assets, list_versions, find_latest, read_summary, read_metadata) {
const db_handles = {};
for (const [k, v] of Object.entries(db_paths)) {
db_handles[k] = Database(v);
Expand All @@ -14,21 +14,21 @@ export async function manualHandler(db_paths, project, asset, version, list_asse
for (const db of Object.values(db_handles)) {
deleteProject(db, project);
}
await fresh.internal_freshProject(db_handles, project, list_assets, list_versions, find_latest, read_summary, read_metadata, db_tokenizable);
await fresh.internal_freshProject(db_handles, project, list_assets, list_versions, find_latest, read_summary, read_metadata);

} else if (version == null) {
for (const db of Object.values(db_handles)) {
deleteAsset(db, project, asset);
}
await fresh.internal_freshAsset(db_handles, project, asset, list_versions, find_latest, read_summary, read_metadata, db_tokenizable);
await fresh.internal_freshAsset(db_handles, project, asset, list_versions, find_latest, read_summary, read_metadata);

} else {
for (const db of Object.values(db_handles)) {
deleteVersion(db, project, asset, version);
}
const latest = find_latest(project, asset);
if (latest != null) { // short-circuit if latest = null, as this implies that there are no (non-probational) versions.
await fresh.internal_freshVersion(db_handles, project, asset, version, latest, read_summary, read_metadata, db_tokenizable);
await fresh.internal_freshVersion(db_handles, project, asset, version, latest, read_summary, read_metadata);
}
}
}
4 changes: 2 additions & 2 deletions src/handlers/updateHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export async function readLogs(last_modified, list_logs, read_log) {
return logs;
}

export async function updateHandler(db_paths, last_modified, list_logs, read_log, read_metadata, find_latest, db_tokenizable) {
export async function updateHandler(db_paths, last_modified, list_logs, read_log, read_metadata, find_latest) {
const db_handles = {};
for (const [k, v] of Object.entries(db_paths)) {
db_handles[k] = Database(v);
Expand All @@ -104,7 +104,7 @@ export async function updateHandler(db_paths, last_modified, list_logs, read_log
const version = safe_extract(parameters, "version");
let output = await read_metadata(project, asset, version, to_extract);
for (const [e, db] of Object.entries(db_handles)) {
addVersion(db, project, asset, version, is_latest(parameters), output[e], db_tokenizable[e]);
addVersion(db, project, asset, version, is_latest(parameters), output[e]);
}

} else if (type == "delete-version") {
Expand Down
14 changes: 6 additions & 8 deletions src/sqlite/addVersion.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { splitIntoTokens } from "./splitIntoTokens.js";

export function addVersion(db, project, asset, version, latest, metadata, tokenizable) {
export function addVersion(db, project, asset, version, latest, metadata) {
const trans = db.transaction(() => {
db.prepare("DELETE FROM versions WHERE project = ? AND asset = ? AND VERSION = ?").run(project, asset, version);
if (latest) {
Expand All @@ -22,32 +22,30 @@ export function addVersion(db, project, asset, version, latest, metadata, tokeni
for (const [p, m] of Object.entries(metadata)) {
let pinfo = db.prepare("INSERT INTO paths(vid, path, metadata) VALUES(?, ?, jsonb(?)) RETURNING pid").get(vid, p, JSON.stringify(m));
let pid = pinfo.pid;
traverse_metadata(db, pid, m, null, insert_token, tokenizable);
traverse_metadata(db, pid, m, null, insert_token);
}
});

trans();
return;
}

function traverse_metadata(db, pid, metadata, property, insert_token, tokenizable) {
function traverse_metadata(db, pid, metadata, property, insert_token) {
if (metadata instanceof Array) {
for (const v of metadata) {
traverse_metadata(db, pid, v, property, insert_token, tokenizable);
traverse_metadata(db, pid, v, property, insert_token);
}
} else if (metadata instanceof Object) {
for (const [k, v] of Object.entries(metadata)) {
let newname = (property == null ? k : property + "." + k);
traverse_metadata(db, pid, v, newname, insert_token, tokenizable);
traverse_metadata(db, pid, v, newname, insert_token);
}
} else {
if (typeof metadata == "string" && tokenizable.has(property)) {
if (typeof metadata == "string") {
let tokens = splitIntoTokens(metadata);
for (const t of tokens) {
insert_token(pid, property, t);
}
} else {
insert_token(pid, property, String(metadata));
}
}
}
14 changes: 4 additions & 10 deletions tests/handlers/freshHandler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,8 @@ import Database from "better-sqlite3";
test("freshHandler works correctly without probation", async () => {
const testdir = utils.setupTestDirectory("freshHandler");
let all_paths = {};
let all_tokenizable = {};
for (const p of [ "_meta", "_alt" ]) {
all_paths[p] = path.join(testdir, "test" + p + ".sqlite3")
all_tokenizable[p] = new Set(["description", "motto"]);
}

await freshHandler(
Expand Down Expand Up @@ -54,8 +52,7 @@ test("freshHandler works correctly without probation", async () => {
"_alt": { "thingy.csv": utils.mockMetadata["macrophage"] }
}
}
},
all_tokenizable
}
);

// Check that all versions are added, along with their metadata entries.
Expand All @@ -69,7 +66,7 @@ test("freshHandler works correctly without probation", async () => {
expect(vpayload.map(x => x.version)).toEqual(["bar1", "bar2", "v1", "v1"]);
expect(vpayload.map(x => x.latest)).toEqual([0, 1, 1, 1]);

let tpayload = utils.scanForToken(db, 'Donato');
let tpayload = utils.scanForToken(db, 'donato');
if (x == "_meta") {
expect(tpayload.length).toBeGreaterThan(0);
} else {
Expand All @@ -90,10 +87,8 @@ test("freshHandler works correctly without probation", async () => {
test("freshHandler works correctly with probation", async () => {
const testdir = utils.setupTestDirectory("freshHandler");
let all_paths = {};
let all_tokenizable = {};
for (const p of [ "_meta", "_alt" ]) {
all_paths[p] = path.join(testdir, "test" + p + ".sqlite3")
all_tokenizable[p] = new Set(["description", "motto"]);
}

await freshHandler(
Expand Down Expand Up @@ -132,8 +127,7 @@ test("freshHandler works correctly with probation", async () => {
"_meta": { "AAA.json": utils.mockMetadata["marcille"] },
"_alt": { "BBB/CCC.txt": utils.mockMetadata["chicken"] }
}
},
all_tokenizable
}
);

// Check that all versions are added, along with their metadata entries.
Expand All @@ -147,7 +141,7 @@ test("freshHandler works correctly with probation", async () => {
expect(vpayload[0].version).toBe("bar1");
expect(vpayload[0].latest).toBe(1);

let tpayload = utils.scanForToken(db, 'Donato');
let tpayload = utils.scanForToken(db, 'donato');
if (x == "_meta") {
expect(tpayload.length).toBeGreaterThan(0);
} else {
Expand Down
10 changes: 4 additions & 6 deletions tests/handlers/manualHandler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,12 @@ import Database from "better-sqlite3";
test("manualHandler works correctly", async () => {
const testdir = utils.setupTestDirectory("manualHandler");
let all_paths = {};
let all_tokenizable = {};
for (const p of [ "_meta", "_alt" ]) {
let opath = path.join(testdir, "test" + p + ".sqlite3")
let db = Database(opath);
createTables(db);
db.close();
all_paths[p] = opath;
all_tokenizable[p] = new Set(["description", "motto"]);
}

// Set up the various functions.
Expand Down Expand Up @@ -60,7 +58,7 @@ test("manualHandler works correctly", async () => {
};

// Refreshing a single version.
await manualHandler(all_paths, "test", "foo", "bar1", listAssets, listVersions, findLatest, readSummary, readMetadata, all_tokenizable);
await manualHandler(all_paths, "test", "foo", "bar1", listAssets, listVersions, findLatest, readSummary, readMetadata);

for (const [x, p] of Object.entries(all_paths)) {
const db = Database(p);
Expand All @@ -72,7 +70,7 @@ test("manualHandler works correctly", async () => {
expect(vpayload[0].version).toEqual("bar1");
expect(vpayload[0].latest).toEqual(0);

let tpayload = utils.scanForToken(db, 'Donato');
let tpayload = utils.scanForToken(db, 'donato');
if (x == "_meta") {
expect(tpayload.length).toBeGreaterThan(0);
} else {
Expand All @@ -90,7 +88,7 @@ test("manualHandler works correctly", async () => {
}

// Refreshing a single asset.
await manualHandler(all_paths, "test", "foo", null, listAssets, listVersions, findLatest, readSummary, readMetadata, all_tokenizable);
await manualHandler(all_paths, "test", "foo", null, listAssets, listVersions, findLatest, readSummary, readMetadata);

for (const [x, p] of Object.entries(all_paths)) {
const db = Database(p);
Expand All @@ -110,7 +108,7 @@ test("manualHandler works correctly", async () => {
}

// Refreshing a single project.
await manualHandler(all_paths, "test", null, null, listAssets, listVersions, findLatest, readSummary, readMetadata, all_tokenizable);
await manualHandler(all_paths, "test", null, null, listAssets, listVersions, findLatest, readSummary, readMetadata);

for (const [x, p] of Object.entries(all_paths)) {
const db = Database(p);
Expand Down
Loading

0 comments on commit 3fceb66

Please sign in to comment.