Open
Description
Describe the bug
The `text_as_html` field in the response is corrupted when a table contains `//`.
To Reproduce
Run the following code with `yarn ts-node bugtest.ts Test\ Table.pdf`
(or an equivalent command) against the attached PDF:
import { UnstructuredClient } from "unstructured-client";
import { Strategy } from "unstructured-client/sdk/models/shared/index.js";
import * as fs from "fs";
// Reproduction script: sends one file to the partition endpoint using the
// hi-res strategy and prints the elements that come back.
const apiKey = "MY_KEY";
const client = new UnstructuredClient({
  security: { apiKeyAuth: apiKey },
  serverURL: "https://api.unstructuredapp.io",
});

// The file to partition is the first user-supplied command line argument.
const [, , filename] = process.argv;
if (!filename) {
  console.error("Please provide a filename as a command line argument");
  process.exit(1);
}

// Read the whole file up front; its bytes are sent as the request content.
const fileBytes = fs.readFileSync(filename);

/** Prints the returned elements when the status code is 200, otherwise reports the status code. */
const printElements = (res: any): void => {
  if (res.statusCode !== 200) {
    console.error("API call failed with status code:", res.statusCode);
    return;
  }
  console.log(res.elements);
};

/** Logs the `statusCode` and `body` properties of whatever the request rejected with. */
const printFailure = (e: any): void => {
  console.error(e.statusCode);
  console.error(e.body);
};

client.general
  .partition({
    partitionParameters: {
      files: { content: fileBytes, fileName: filename },
      strategy: Strategy.HiRes,
    },
  })
  .then(printElements)
  .catch(printFailure);
Expected behavior
The HTML table should contain the same text as the text output. Instead the output is:
{
type: 'Table',
element_id: '78250fc2ab718b4f7b441c3202ec7ec2',
text: 'COL1 COL2 COL3 TEST // TEST // TEST MOCK DATA MOCK DATA TEST // TEST // TEST MOCK DATA MOCK DATA TEST // TEST // TEST MOCK DATA MOCK DATA TEST // TEST // TEST MOCK DATA MOCK DATA',
metadata: {
text_as_html: '<table><thead><tr><th>110D</th><th>COL3\n' +
' COL2</th></tr></thead><tbody><tr><td colspan="2">VIVA JIDO0N| VIVA AD0NW| LSHL // LSHL // LSHL</td></tr><tr><td colspan="2">VIVA JIDO0N| VIVA AD0NW| LSHL // LSHL // LSHL</td></tr><tr><td colspan="2">VIVA JIDO0N| VIVA AD0NW| LSHL // LSHL // LSHL</td></tr><tr><td>LSHL // LSHL // LSHL</td><td>VIVA JIDO0N| VIVA AD0NW|</td></tr></tbody></table>',
filetype: 'application/pdf',
languages: [Array],
page_number: 1,
parent_id: 'bb9d976cc755b408de05367845ca47a3',
filename: 'Test Table.pdf'
}