Skip to content

Commit 033f746

Browse files
authored
Feature/cli scraper (#347)
2 parents 8413614 + 58132ce commit 033f746

84 files changed

Lines changed: 4386 additions & 2963 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

apps/cli/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
dist
2+
node_modules
3+
.turbo
4+
*.tsbuildinfo
5+
cache/

apps/cli/eslint.config.mjs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
import { config } from "@sneu/eslint-config/base";
2+
3+
// Re-export the shared base ESLint config for this package. The spread makes a
// fresh array, so the exported value is a copy of the shared `config` array
// rather than the same array instance.
/** @type {import("eslint").Linter.Config[]} */
export default [...config];

apps/cli/package.json

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
{
2+
"name": "@sneu/cli",
3+
"version": "0.0.1",
4+
"private": true,
5+
"type": "module",
6+
"scripts": {
7+
"build": "tsc",
8+
"lint": "eslint .",
9+
"cli": "node --env-file .env --import tsx ./src/cli.ts"
10+
},
11+
"dependencies": {
12+
"@clack/prompts": "^1.1.0",
13+
"@sneu/db": "workspace:*",
14+
"@sneu/scraper": "workspace:*",
15+
"citty": "^0.2.0",
16+
"drizzle-orm": "^0.45.1",
17+
"picocolors": "^1.1.1",
18+
"yaml": "^2.8.2",
19+
"zod": "^4.3.6"
20+
},
21+
"devDependencies": {
22+
"@sneu/eslint-config": "workspace:*",
23+
"@sneu/tsconfig": "workspace:*",
24+
"@types/node": "^25.5.0",
25+
"eslint": "^9.39.2",
26+
"tsx": "^4.21.0",
27+
"typescript": "^5.9.3"
28+
}
29+
}

apps/cli/src/cli.ts

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
/**
2+
* SearchNEU Scraper CLI — single entry point.
3+
*
4+
* Usage:
5+
* pnpm run cli generate [options]
6+
* pnpm run cli upload [options]
7+
* pnpm run cli tools <subcommand> [options]
8+
*/
9+
10+
import { defineCommand, runMain } from "citty";
11+
12+
/**
 * `tools` command group: config management and validation utilities.
 *
 * Every subcommand is registered as a lazy loader (a function that performs
 * the dynamic import) instead of an already-started import promise. With
 * eager promises, all five tool modules would be loaded as soon as this file
 * is evaluated, even when the user runs an unrelated command, and a tool
 * module that fails to load could surface as an unhandled rejection. Lazy
 * loaders defer each import until the loader is actually invoked, and match
 * how the root command registers `generate` and `upload`.
 */
const tools = defineCommand({
  meta: {
    name: "tools",
    description: "config management and validation tools",
  },
  subCommands: {
    "seed-config": () => import("./tools/seed-config").then((m) => m.default),
    "expire-terms": () => import("./tools/expire-terms").then((m) => m.default),
    "check-config": () => import("./tools/check-config").then((m) => m.default),
    "sync-db": () => import("./tools/sync-db").then((m) => m.default),
    "update-cache": () => import("./tools/update-cache").then((m) => m.default),
  },
});
25+
26+
/**
 * Root command of the SearchNEU CLI.
 *
 * `generate` and `upload` are resolved on demand through dynamic imports of
 * their modules' default exports; `tools` is the command group defined
 * earlier in this file.
 */
const main = defineCommand({
  meta: { name: "cli", description: "SearchNEU CLI" },
  subCommands: {
    generate: async () => {
      const mod = await import("./commands/generate");
      return mod.default;
    },
    upload: async () => {
      const mod = await import("./commands/upload");
      return mod.default;
    },
    tools,
  },
});
37+
38+
// Entry point: hand the root command to citty's runner. `void` explicitly
// discards the value returned by `runMain`.
void runMain(main);
Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
import { existsSync, readFileSync, writeFileSync } from "node:fs";
22
import path from "node:path";
33
import { parse } from "yaml";
4-
import { defineCommand, runMain } from "citty";
5-
import { scrapeCatalogTerm } from "../generate/main";
4+
import { defineCommand } from "citty";
5+
import { scrapeCatalogTerm } from "@sneu/scraper/generate";
66
import { infer as zinfer } from "zod";
7-
import { Config } from "../config";
8-
import { consola } from "consola";
9-
import { ScraperBannerCache } from "../schemas/scraper/banner-cache";
7+
import { Config } from "@sneu/scraper/config";
8+
import { ScraperBannerCache } from "@sneu/scraper/schemas/banner-cache";
9+
import { ScraperEventEmitter } from "@sneu/scraper/events";
10+
import { brandIntro, p, pc, setVerbosity } from "../ui";
11+
import { attachLogger } from "../logger";
1012

1113
/**
 * Builds the cache file name for a term.
 *
 * @param term - term identifier, already converted to a string
 * @returns the file name `term-<term>.json` (no directory component)
 */
const CACHE_FORMAT = (term: string): string => {
  const fileName = `term-${term}.json`;
  return fileName;
};
12-
const CACHE_VERSION = 3;
14+
const CACHE_VERSION = 5;
1315

14-
const main = defineCommand({
16+
export default defineCommand({
1517
meta: {
16-
name: "scrape:gen",
18+
name: "generate",
1719
description: "runs the scraper to generate the banner cache files",
1820
},
1921
args: {
@@ -26,8 +28,15 @@ const main = defineCommand({
2628
},
2729
cachePath: {
2830
type: "string",
29-
default: "cache/",
30-
description: "",
31+
default: process.env.SCRAPER_CACHE_PATH ?? "cache/",
32+
description: "path to cache directory (env: SCRAPER_CACHE_PATH)",
33+
required: false,
34+
},
35+
configPath: {
36+
type: "string",
37+
default: process.env.SCRAPER_CONFIG_PATH ?? "config/",
38+
description:
39+
"path to config directory containing manifest.yaml (env: SCRAPER_CONFIG_PATH)",
3140
required: false,
3241
},
3342
interactive: {
@@ -56,59 +65,64 @@ const main = defineCommand({
5665
},
5766
},
5867
async run({ args }) {
59-
if (args.verbose) consola.level = 4;
60-
if (args.veryverbose) consola.level = 999;
68+
// const interactive = args.interactive ?? false;
69+
// setVerbosity({ verbose: args.verbose, veryVerbose: args.veryverbose });
70+
setVerbosity({ verbose: true, veryVerbose: false });
71+
// updateSettings({ withGuide: false });
72+
brandIntro("generate");
6173

62-
const interactive = args.interactive ?? false;
74+
const emitter = new ScraperEventEmitter();
75+
attachLogger(emitter, { interactive: true });
6376

6477
const configStream = readFileSync(
65-
path.resolve(args.cachePath, "manifest.yaml"),
66-
{
67-
encoding: "utf8",
68-
},
78+
path.resolve(args.configPath, "manifest.yaml"),
79+
{ encoding: "utf8" },
6980
);
7081
const configRaw = parse(configStream);
7182
const configResponse = Config.safeParse(configRaw);
7283
if (!configResponse.success) {
73-
consola.error(configResponse.error);
84+
p.log.error(pc.red(String(configResponse.error)));
85+
p.cancel("Invalid config");
7486
return;
7587
}
7688

7789
const config = configResponse.data;
78-
7990
const termsToScrape = filterTerms(config, args.terms);
80-
consola.info(`scraping ${termsToScrape.length} terms`);
91+
92+
p.log.info(
93+
`Scraping ${pc.bold(String(termsToScrape.length))} term${termsToScrape.length !== 1 ? "s" : ""}`,
94+
);
8195

8296
if (termsToScrape.length === 0) {
83-
consola.log("no active / configured terms to scrape");
97+
p.outro("No active terms to scrape");
8498
return;
8599
}
86100

87101
for (const termConfig of termsToScrape) {
88-
consola.start(`scraping term ${termConfig.term}`);
89-
90102
const cachename = path.resolve(
91103
args.cachePath,
92104
CACHE_FORMAT(termConfig.term.toString()),
93105
);
94106
const existingCache = existsSync(cachename);
95107
if (args.overwrite && existingCache) {
96-
consola.info("existing cache found, overwriting with new scrape");
108+
p.log.info(
109+
`Existing cache for ${pc.cyan(String(termConfig.term))}, overwriting`,
110+
);
97111
} else if (!args.overwrite && existingCache) {
98-
consola.success("existing cache found, skipping term");
112+
p.log.success(
113+
`Cache exists for ${pc.cyan(String(termConfig.term))}, skipping`,
114+
);
99115
continue;
100116
}
101117

102118
try {
103119
const out = await scrapeCatalogTerm(
104120
termConfig.term.toString(),
105-
termConfig,
106-
interactive,
121+
emitter,
107122
);
108123

109124
if (!out) {
110-
consola.error(`error scraping term ${termConfig.term}`);
111-
// return;
125+
p.log.error(pc.red(`Failed to scrape term ${termConfig.term}`));
112126
continue;
113127
}
114128

@@ -119,23 +133,18 @@ const main = defineCommand({
119133
};
120134

121135
writeFileSync(cachename, JSON.stringify(cachedData, null, 2));
122-
consola.success(`scraped term ${termConfig.term}`);
123136
} catch (e) {
124-
consola.error(`failed to scrape term ${termConfig.term}`, e);
137+
p.log.error(pc.red(`Failed to scrape term ${termConfig.term}: ${e}`));
125138
continue;
126139
}
127140
}
128141

129-
consola.success(
130-
`successfully scraped ${termsToScrape.length} term${termsToScrape.length > 1 ? "s" : ""}`,
142+
p.outro(
143+
`Scraped ${pc.bold(String(termsToScrape.length))} term${termsToScrape.length > 1 ? "s" : ""} — cache is fresh`,
131144
);
132145
},
133146
});
134147

135-
void runMain(main);
136-
137-
/**
138-
*/
139148
function filterTerms(config: zinfer<typeof Config>, termArg: string) {
140149
if (termArg === "all") {
141150
return config.terms;
@@ -151,7 +160,9 @@ function filterTerms(config: zinfer<typeof Config>, termArg: string) {
151160
splitTerms.includes(t.term.toString()),
152161
);
153162
if (filteredTerms.length === 0) {
154-
consola.error(`no matching terms found for: ${splitTerms.join(", ")}`);
163+
p.log.error(
164+
pc.red(`No matching terms found for: ${splitTerms.join(", ")}`),
165+
);
155166
process.exit(1);
156167
}
157168
return filteredTerms;

0 commit comments

Comments
 (0)