Skip to content

Commit 5e21351

Browse files
committed
initial pdf text extraction setup
1 parent 0d653ed commit 5e21351

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
11
[submodule "htdocs/zotero-schema"]
22
path = htdocs/zotero-schema
33
url = https://github.com/zotero/zotero-schema.git
4+
[submodule "tests/remote_js/full-text-extractor"]
5+
path = tests/remote_js/full-text-extractor
6+
url = https://github.com/zotero/full-text-extractor.git
Submodule full-text-extractor added at 00932c4
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
import chai from 'chai';
2+
const assert = chai.assert;
3+
import config from 'config';
4+
import API from '../../api3.js';
5+
import Helpers from '../../helpers3.js';
6+
import shared from "../shared.js";
7+
import { S3Client, DeleteObjectsCommand } from "@aws-sdk/client-s3";
8+
import fs from 'fs';
9+
import HTTP from '../../httpHandler.js';
10+
import { localInvoke } from '../../full-text-extractor/src/local_invoke.mjs';
11+
12+
13+
/**
 * Integration test: uploads a generated one-page PDF as an imported-file
 * attachment, runs the full-text extractor locally via `localInvoke()`, and
 * checks that the API's `fulltext` endpoint returns the text that was
 * embedded in the PDF.
 *
 * Uses a scratch directory at ./work for the generated file and deletes the
 * recorded S3 keys from `config.s3Bucket` when the suite finishes.
 */
describe('FileTestTests', function () {
	// A timeout of 0 disables Mocha's per-test timeout for this suite.
	this.timeout(0);

	// S3 object keys recorded during the tests; deleted in the `after` hook.
	const toDelete = [];
	const s3Client = new S3Client({ region: "us-east-1" });

	const WORK_DIR = "./work";
	const PDF_PATH = `${WORK_DIR}/dummy.pdf`;

	before(async function () {
		await shared.API3Before();
		// `recursive: true` makes this a no-op when the directory already
		// exists, while still surfacing real failures (e.g. permissions)
		// instead of silently swallowing every error.
		fs.mkdirSync(WORK_DIR, { recursive: true });
	});

	after(async function () {
		await shared.API3After();

		// Await the removal so the hook does not complete while the delete is
		// still in flight. Cleanup stays best-effort: failures are logged,
		// not thrown.
		try {
			await fs.promises.rm(WORK_DIR, { recursive: true, force: true });
		}
		catch (e) {
			console.log(e);
		}

		if (toDelete.length > 0) {
			const command = new DeleteObjectsCommand({
				Bucket: config.s3Bucket,
				Delete: {
					Objects: toDelete.map(key => ({ Key: key }))
				}
			});
			await s3Client.send(command);
		}
	});

	beforeEach(() => {
		API.useAPIKey(config.apiKey);
	});

	it('should_extract_pdf_text', async function () {
		// Parent item with no children yet
		let json = await API.createItem("book", false, this, 'json');
		assert.equal(json.meta.numChildren, 0);
		const parentKey = json.key;

		json = await API.createAttachmentItem("imported_file", [], parentKey, this, 'json');
		const attachmentKey = json.key;
		const version = json.version;

		const filename = "dummy.pdf";
		const mtime = Date.now();
		// Writes ./work/dummy.pdf and returns the text embedded in it
		const pdfText = makeRandomPDF();

		// Read the file once as UTF-8 text. The upload body below is built by
		// string concatenation, so size and md5 are derived from the same
		// string to stay consistent with the bytes actually sent.
		const fileContents = fs.readFileSync(PDF_PATH, "utf8");
		const size = Buffer.byteLength(fileContents);
		const md5 = Helpers.md5(fileContents);

		// Create attachment item
		let response = await API.userPost(
			config.userID,
			"items",
			JSON.stringify([
				{
					key: attachmentKey,
					contentType: "application/pdf",
				}
			]),
			{
				"Content-Type": "application/json",
				"If-Unmodified-Since-Version": version
			}
		);
		Helpers.assert200ForObject(response);

		// Get upload authorization
		response = await API.userPost(
			config.userID,
			`items/${attachmentKey}/file`,
			Helpers.implodeParams({
				md5: md5,
				mtime: mtime,
				filename: filename,
				filesize: size
			}),
			{
				"Content-Type": "application/x-www-form-urlencoded",
				"If-None-Match": "*"
			}
		);
		Helpers.assert200(response);
		json = API.getJSONFromResponse(response);

		// Upload
		response = await HTTP.post(
			json.url,
			json.prefix + fileContents + json.suffix,
			{
				"Content-Type": json.contentType
			}
		);
		Helpers.assert201(response);

		// Record the key for cleanup as soon as the upload has succeeded, so
		// the object is still removed if a later step of this test fails.
		toDelete.push(md5);

		// Post-upload file registration
		response = await API.userPost(
			config.userID,
			`items/${attachmentKey}/file`,
			`upload=${json.uploadKey}`,
			{
				"Content-Type": "application/x-www-form-urlencoded",
				"If-None-Match": "*"
			}
		);
		Helpers.assert204(response);

		// Local invoke full-text-extractor
		await localInvoke();

		// Get full text to ensure full-text-extractor worked
		response = await API.userGet(
			config.userID,
			`items/${attachmentKey}/fulltext`
		);
		Helpers.assert200(response);
		const data = JSON.parse(response.data);
		assert.property(data, 'content');
		assert.equal(data.content.trim(), pdfText);
	});

	/**
	 * Write a minimal single-page PDF containing a unique token to
	 * ./work/dummy.pdf.
	 *
	 * NOTE(review): the `/Length 44`, xref offsets and `startxref` values are
	 * fixed literals and are not derived from the token, so they only hold for
	 * one specific token length - confirm against `Helpers.uniqueToken()`.
	 * The template lines must stay unindented; leading whitespace would become
	 * part of the file and shift the byte offsets.
	 *
	 * @returns {string} The token embedded as the page's text
	 */
	const makeRandomPDF = () => {
		const randomText = Helpers.uniqueToken();
		const pdfData = `%PDF-1.4
1 0 obj <</Type /Catalog /Pages 2 0 R>>
endobj
2 0 obj <</Type /Pages /Kids [3 0 R] /Count 1>>
endobj
3 0 obj<</Type /Page /Parent 2 0 R /Resources 4 0 R /MediaBox [0 0 500 800] /Contents 6 0 R>>
endobj
4 0 obj<</Font <</F1 5 0 R>>>>
endobj
5 0 obj<</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>
endobj
6 0 obj
<</Length 44>>
stream
BT /F1 24 Tf 175 720 Td (${randomText})Tj ET
endstream
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000056 00000 n
0000000111 00000 n
0000000212 00000 n
0000000250 00000 n
0000000317 00000 n
trailer <</Size 7/Root 1 0 R>>
startxref
406
%%EOF`;
		fs.writeFileSync(`./work/dummy.pdf`, pdfData);
		return randomText;
	};
});
175+
176+

0 commit comments

Comments
 (0)