|
| 1 | +import chai from 'chai'; |
| 2 | +const assert = chai.assert; |
| 3 | +import config from 'config'; |
| 4 | +import API from '../../api3.js'; |
| 5 | +import Helpers from '../../helpers3.js'; |
| 6 | +import shared from "../shared.js"; |
| 7 | +import { S3Client, DeleteObjectsCommand } from "@aws-sdk/client-s3"; |
| 8 | +import fs from 'fs'; |
| 9 | +import HTTP from '../../httpHandler.js'; |
| 10 | +import { localInvoke } from '../../full-text-extractor/src/local_invoke.mjs'; |
| 11 | + |
| 12 | + |
/**
 * End-to-end test for the full-text extractor: uploads a freshly generated PDF
 * as an imported-file attachment, invokes the extractor locally and checks that
 * the item's full-text content equals the text embedded in the PDF.
 */
describe('FileTestTests', function () {
	// A timeout of 0 disables Mocha's per-test timeout; the test performs several network round trips.
	this.timeout(0);

	const WORK_DIR = "./work";
	const PDF_PATH = `${WORK_DIR}/dummy.pdf`;

	// S3 object keys uploaded by the tests; removed from the bucket in the `after` hook.
	const toDelete = [];
	const s3Client = new S3Client({ region: "us-east-1" });

	/**
	 * Writes a minimal single-page PDF whose only text is a unique token to PDF_PATH.
	 *
	 * NOTE(review): the stream `/Length`, the xref offsets and `startxref` are hard-coded,
	 * so they presumably assume a fixed-length value from Helpers.uniqueToken() — confirm.
	 *
	 * @returns {string} The token embedded in the PDF's text stream.
	 */
	const makeRandomPDF = () => {
		const randomText = Helpers.uniqueToken();
		const pdfData = `%PDF-1.4
1 0 obj <</Type /Catalog /Pages 2 0 R>>
endobj
2 0 obj <</Type /Pages /Kids [3 0 R] /Count 1>>
endobj
3 0 obj<</Type /Page /Parent 2 0 R /Resources 4 0 R /MediaBox [0 0 500 800] /Contents 6 0 R>>
endobj
4 0 obj<</Font <</F1 5 0 R>>>>
endobj
5 0 obj<</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>
endobj
6 0 obj
<</Length 44>>
stream
BT /F1 24 Tf 175 720 Td (${randomText})Tj ET
endstream
endobj
xref
0 7
0000000000 65535 f
0000000009 00000 n
0000000056 00000 n
0000000111 00000 n
0000000212 00000 n
0000000250 00000 n
0000000317 00000 n
trailer <</Size 7/Root 1 0 R>>
startxref
406
%%EOF`;
		fs.writeFileSync(PDF_PATH, pdfData);
		return randomText;
	};

	before(async function () {
		await shared.API3Before();
		// With `recursive: true` an already-existing directory is not an error,
		// while genuine failures (e.g. permissions) still surface instead of being swallowed.
		fs.mkdirSync(WORK_DIR, { recursive: true });
	});

	after(async function () {
		try {
			await shared.API3After();
		}
		finally {
			// Run cleanup even if API3After() throws, and wait for it to finish
			// before the hook returns.
			try {
				await fs.promises.rm(WORK_DIR, { recursive: true, force: true });
			}
			catch (e) {
				// Removing the scratch directory is best-effort; report and carry on.
				console.log(e);
			}

			if (toDelete.length > 0) {
				const command = new DeleteObjectsCommand({
					Bucket: config.s3Bucket,
					Delete: {
						Objects: toDelete.map(key => ({ Key: key }))
					}
				});
				await s3Client.send(command);
			}
		}
	});

	beforeEach(async () => {
		API.useAPIKey(config.apiKey);
	});

	it('should_extract_pdf_text', async function () {
		let json = await API.createItem("book", false, this, 'json');
		assert.equal(json.meta.numChildren, 0);
		const parentKey = json.key;

		json = await API.createAttachmentItem("imported_file", [], parentKey, this, 'json');
		const attachmentKey = json.key;
		const version = json.version;

		const filename = "dummy.pdf";
		const mtime = Date.now();
		const pdfText = makeRandomPDF();

		const fileContents = fs.readFileSync(PDF_PATH);
		// Decode once; the same string is hashed and embedded in the upload body.
		const fileText = fileContents.toString();
		const size = fileContents.byteLength;
		const md5 = Helpers.md5(fileText);

		// Set the content type on the attachment item
		let response = await API.userPost(
			config.userID,
			"items",
			JSON.stringify([
				{
					key: attachmentKey,
					contentType: "application/pdf",
				}
			]),
			{
				"Content-Type": "application/json",
				"If-Unmodified-Since-Version": version
			}
		);
		Helpers.assert200ForObject(response);

		// Get upload authorization
		response = await API.userPost(
			config.userID,
			"items/" + attachmentKey + "/file",
			Helpers.implodeParams({
				md5: md5,
				mtime: mtime,
				filename: filename,
				filesize: size
			}),
			{
				"Content-Type": "application/x-www-form-urlencoded",
				"If-None-Match": "*"
			}
		);
		Helpers.assert200(response);
		json = API.getJSONFromResponse(response);

		// Upload
		response = await HTTP.post(
			json.url,
			json.prefix + fileText + json.suffix,
			{
				"Content-Type": json.contentType
			}
		);
		Helpers.assert201(response);

		// Schedule the uploaded object for deletion as soon as it exists, so a failure
		// in any later step does not leave it behind in the bucket.
		toDelete.push(md5);

		// Post-upload file registration
		response = await API.userPost(
			config.userID,
			"items/" + attachmentKey + "/file",
			"upload=" + json.uploadKey,
			{
				"Content-Type": "application/x-www-form-urlencoded",
				"If-None-Match": "*"
			}
		);
		Helpers.assert204(response);

		// Local invoke full-text-extractor
		await localInvoke();

		// Get full text to ensure full-text-extractor worked
		response = await API.userGet(
			config.userID,
			"items/" + attachmentKey + "/fulltext"
		);
		Helpers.assert200(response);
		const data = JSON.parse(response.data);
		assert.property(data, 'content');
		assert.equal(data.content.trim(), pdfText);
	});
});
| 175 | + |
| 176 | + |
0 commit comments