Skip to content

Commit cd538ab

Browse files
committed
refactor: optimize jwpub parsing
1 parent 4696012 commit cd538ab

File tree

7 files changed

+253
-120
lines changed

7 files changed

+253
-120
lines changed

app/pages/translate/outlines/index.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ const importOutlines = async () => {
5353
try {
5454
const body = new FormData();
5555
body.append("file", jwpubFile.value);
56-
const parsedOutlines = await $fetch<Outline[]>("/api/outlines", {
56+
const parsedOutlines = await $fetch<Outline[]>("/api/outlines/stream", {
5757
body,
5858
method: "POST",
5959
});

app/pages/translate/outlines/new.vue

Lines changed: 0 additions & 93 deletions
This file was deleted.

server/api/outlines/buffer.post.ts

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/**
 * POST handler: takes the single upload found under the "file" form key,
 * loads it fully into memory and returns the outlines extracted from it.
 *
 * The upload is limited to 128MB and must be sent as
 * `application/octet-stream`.
 */
export default defineEventHandler(async (event) => {
  console.log("Getting outlines...");

  const uploads = await receiveFiles(event, {
    ensure: {
      maxSize: "128MB",
      types: ["application/octet-stream"],
    },
    formKey: "file",
    multiple: false,
  });
  const upload = uploads[0];

  if (!upload) {
    throw createError({
      statusCode: 400,
      statusMessage: "No file received",
    });
  }

  console.log("File received");

  // The whole archive is buffered before the database is extracted from it.
  const bytes = await upload.arrayBuffer();
  const database = await getJWPUBDatabaseFromBuffer(bytes);

  return await getOutlinesFromJWPUB(database);
});

server/api/outlines/stream.post.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
/**
 * POST handler: hands the incoming upload to `processFileUpload`, using
 * `getJWPUBDatabaseFromStream` as the processor, and returns the outlines
 * extracted from the resulting database.
 */
export default defineEventHandler(async (event) => {
  console.log("Getting outlines...");

  // 256 MiB upload cap.
  const uploadLimitBytes = 256 * 1024 * 1024;

  const database = await processFileUpload(event, {
    maxSize: uploadLimitBytes,
    processor: getJWPUBDatabaseFromStream,
  });

  return await getOutlinesFromJWPUB(database);
});

server/utils/blob.ts

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
import type {
2+
BlobEnsureOptions,
3+
BlobSize,
4+
BlobUploadOptions,
5+
FileSizeUnit,
6+
} from "#server/types/blob";
7+
import type { H3Event } from "h3";
8+
9+
import { defu } from "defu";
10+
11+
// Credit: based on the shared utils of https://github.com/pingdotgg/uploadthing
12+
const FILESIZE_UNITS = ["B", "KB", "MB", "GB"];
13+
14+
/**
 * Receive one or more files from the request body's FormData without
 * storing them.
 *
 * @param event The incoming H3 event whose body is read as FormData.
 * @param options Upload options. `formKey` defaults to `"files"` and
 * `multiple` defaults to `true`. When `multiple` is `false` more than one
 * file is rejected; when it is a number it is the maximum file count. When
 * `ensure.maxSize` or `ensure.types` is set, every file is checked with
 * `ensureBlob`.
 * @returns The files found under `options.formKey`.
 *
 * @throws
 * A 400 error if no files were sent, too many files were sent, or a file
 * does not meet the `ensure` conditions. A 500 error if the form data
 * itself cannot be read.
 */
export async function receiveFiles(
  event: H3Event,
  options: BlobUploadOptions = {},
) {
  options = defu(options, {
    formKey: "files",
    multiple: true,
  } satisfies BlobUploadOptions);

  // Only the body read is wrapped as a server error. Validation errors
  // below are thrown outside of this handler so their 400 status reaches
  // the client instead of being masked by a generic 500.
  const form = await readFormData(event).catch((e: unknown) => {
    console.error(e);
    throw createError({
      cause: e,
      message: "Error receiving files",
      statusCode: 500,
    });
  });

  const files = form.getAll(options.formKey!) as File[];

  if (!files.length)
    throw createError({ message: "No files received", status: 400 });

  if (!options.multiple && files.length > 1)
    throw createError({
      message: "Multiple files are not allowed",
      status: 400,
    });

  if (typeof options.multiple === "number" && files.length > options.multiple)
    throw createError({
      message: `Number of files exceeded. Maximum allowed: ${options.multiple}`,
      status: 400,
    });

  if (options.ensure?.maxSize || options.ensure?.types?.length) {
    for (const file of files) {
      ensureBlob(file, options.ensure);
    }
  }

  return files;
}
63+
64+
/**
 * Validate a blob against optional size and type constraints.
 *
 * @param blob The blob to check
 * @param options The constraints to check against
 * @param options.maxSize The maximum size of the blob (e.g. '1MB')
 * @param options.types The allowed types of the blob; an entry may match the
 * full MIME type, only its type part or only its subtype part
 * (e.g. ['image/png', 'application/json', 'video'])
 *
 * @throws If the value is not a Blob, is larger than `maxSize`, or has a
 * type that is not allowed
 */
function ensureBlob(blob: Blob, options: BlobEnsureOptions = {}) {
  const { maxSize, types } = options;

  if (blob instanceof Blob === false) {
    throw createError({
      message: "Received invalid file",
      statusCode: 400,
    });
  }

  if (maxSize && blob.size > fileSizeToBytes(maxSize)) {
    throw createError({
      message: `File too heavy. Max size is: ${maxSize}`,
      statusCode: 400,
    });
  }

  const allowed = types ?? [];
  if (allowed.length === 0) return;

  // A blob is accepted when its full MIME type, its type part or its
  // subtype part appears in the allow list.
  const parts = blob.type.split("/");
  const candidates = [blob.type, parts[0] ?? "", parts[1] ?? ""];
  const accepted = candidates.some((candidate) => allowed.includes(candidate));

  if (!accepted) {
    throw createError({
      message: `Invalid file type: ${blob.type}. Only allowed: ${allowed.join(", ")}`,
      statusCode: 400,
    });
  }
}
107+
108+
/**
 * Helper function that converts any valid BlobSize into a numeric byte value.
 *
 * The input is a decimal number (with an optional fractional part) followed
 * by one of the units in `FILESIZE_UNITS`, matched case-insensitively. Units
 * are powers of 1024 and the result is rounded down to a whole byte.
 *
 * @example "1MB", "1500B", "1.2GB"
 *
 * @param input The size string to convert
 * @returns The size in bytes
 *
 * @throws If the input is not a valid BlobSize
 */
function fileSizeToBytes(input: BlobSize) {
  // Group 1 is the whole number including its fraction; group 2 is the unit.
  // eslint-disable-next-line security/detect-non-literal-regexp
  const regex = new RegExp(
    `^(\\d+(?:\\.\\d+)?)\\s*(${FILESIZE_UNITS.join("|")})$`,
    "i",
  );
  const match = input.match(regex);

  if (!match) {
    throw createError({
      message: `Invalid file size format: ${input}`,
      statusCode: 500,
    });
  }

  // Parse the full decimal so that "1.5MB" is not truncated to "1MB".
  const sizeValue = Number.parseFloat(match[1]!);
  const sizeUnit = match[2]!.toUpperCase() as FileSizeUnit;

  // The regex only accepts units from FILESIZE_UNITS, so the index is always
  // found and doubles as the power of 1024.
  const exponent = FILESIZE_UNITS.indexOf(sizeUnit);

  return Math.floor(sizeValue * Math.pow(1024, exponent));
}

server/utils/jwpub.ts

Lines changed: 70 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ import unzipper from "unzipper";
1212
* @param stream The readable stream of the .jwpub file.
1313
* @returns The loaded database.
1414
*/
15-
export const getJWPUBDatabase = async (stream: Readable): Promise<Database> => {
15+
export const getJWPUBDatabaseFromStream = async (
16+
stream: Readable,
17+
): Promise<Database> => {
1618
try {
1719
const dbBuffer = await pipeline(
1820
stream,
@@ -39,8 +41,49 @@ export const getJWPUBDatabase = async (stream: Readable): Promise<Database> => {
3941
}
4042
};
4143

42-
export const parseJWPUB = async (db: Database) => {
43-
const htmlDocs = await getHTMLDocs(db);
44+
/**
 * Gets the database from a .jwpub file that is already fully in memory.
 *
 * The buffer is opened as a zip, its "contents" entry is opened as a second
 * zip, and the first entry of that inner zip whose name ends in ".db" is
 * passed to `loadDatabase`.
 *
 * @param buffer The bytes of the .jwpub file.
 * @returns The loaded database.
 *
 * @throws
 * A 400 error if the archive has no "contents" entry or no ".db" entry.
 * A 500 error if unzipping or loading the database fails.
 */
export const getJWPUBDatabaseFromBuffer = async (
  buffer: ArrayBuffer,
): Promise<Database> => {
  // Runs one extraction step and reports any failure as a 500. The
  // structural checks between the steps stay outside of it so their 400
  // status is not overwritten by this wrapper.
  const orFail = async <T>(step: () => Promise<T>): Promise<T> => {
    try {
      return await step();
    } catch (e) {
      console.error(e);
      throw createError({
        cause: e,
        message: "Failed to get database from .jwpub file",
        status: 500,
      });
    }
  };

  const outerZip = await orFail(async () => extractZipFiles(buffer));

  const contents = outerZip.files["contents"];
  if (!contents) {
    throw createError({
      message: "No contents file found in the JWPUB file",
      status: 400,
    });
  }

  const innerZip = await orFail(async () =>
    extractZipFiles(await contents.async("uint8array")),
  );

  const dbFile = Object.keys(innerZip.files).find((file) =>
    file.endsWith(".db"),
  );
  if (!dbFile)
    throw createError({
      message: "No database file found in the JWPUB file",
      status: 400,
    });

  return orFail(async () => {
    const sqlDb = await innerZip.files[dbFile]!.async("uint8array");
    return loadDatabase(sqlDb);
  });
};
81+
82+
export const parseOutlines = async (
83+
db: Database,
84+
outlines: { Content: BufferSource }[],
85+
) => {
86+
const htmlDocs = await getDocsFromOutlines(db, outlines);
4487
return htmlDocs.map((htmlDoc) => {
4588
// "Nr. {nr} {title}"
4689
let header = htmlDoc.querySelector("header > h1 strong")?.textContent;
@@ -264,22 +307,39 @@ const getDocs = async (db: Database, key: string, iv: string) => {
264307

265308
for (const row of data.at(0)!.values) {
266309
const content = row.at(0) as BufferSource;
267-
const text = await getRawContent(content, key, iv);
268-
const htmlDoc = parseHTML(text);
269-
270-
htmlDoc.querySelectorAll("rt").forEach((rt) => rt.remove());
271-
272-
files.push(htmlDoc);
310+
files.push(await getDoc(content, key, iv));
273311
}
274312

275313
return files;
276314
};
277315

278-
const getHTMLDocs = async (db: Database) => {
316+
/**
 * Turns one stored content blob into a parsed HTML document.
 *
 * The blob is run through `getRawContent` with the given key and iv, the
 * resulting text is parsed, and every `<rt>` element is removed from the
 * parsed document before it is returned.
 */
const getDoc = async (content: BufferSource, key: string, iv: string) => {
  const htmlDoc = parseHTML(await getRawContent(content, key, iv));

  const rtElements = htmlDoc.querySelectorAll("rt");
  rtElements.forEach((element) => {
    element.remove();
  });

  return htmlDoc;
};
324+
325+
/**
 * Returns the documents of a publication database as produced by `getDocs`,
 * using the key and iv obtained from the database's publication card.
 */
export const getHTMLDocs = async (db: Database) => {
  const { iv, key } = await getPubKeyIv(getPubCard(db));

  return await getDocs(db, key, iv);
};
333+
334+
/**
 * Converts the `Content` of each given row into a parsed document via
 * `getDoc`, using the key and iv obtained from the database's publication
 * card. All rows are processed concurrently and the result keeps the order
 * of `contents`.
 */
const getDocsFromOutlines = async (
  db: Database,
  contents: { Content: BufferSource }[],
) => {
  const { iv, key } = await getPubKeyIv(getPubCard(db));

  const pendingDocs = contents.map(({ Content }) => getDoc(Content, key, iv));

  return await Promise.all(pendingDocs);
};

0 commit comments

Comments
 (0)