-
-
Notifications
You must be signed in to change notification settings - Fork 296
Open
Description
Hi
Indexing is too slow. Is there any way to make it run faster? I am on an AMD 3900X 12-core CPU with 128 GB of RAM; indexing a 6 MB PDF takes over 1 minute, and I have thousands of books to index. Thanks.
It gets stuck on this code:
// Excerpt from the full script below: the indexer call the reporter identifies as slow.
$indexer->insert([
'id' => $currentId,
'filename' => $filename,
'content' => $content,
]);
Full source code:
<?php
require_once __DIR__ . '/vendor/autoload.php';
use Smalot\PdfParser\Parser;
use TeamTNT\TNTSearch\TNTSearch;
// Make sure the index directory is present and writable before any work starts.
$indexDir = __DIR__ . '/indexDir';
// The trailing is_dir() re-check tolerates the directory appearing between the
// first check and a failed mkdir() (e.g. created by another process).
$indexDirReady = is_dir($indexDir) || mkdir($indexDir, 0777, true) || is_dir($indexDir);
if (!$indexDirReady) {
    die("Failed to create indexDir directory: $indexDir\n");
}
if (!is_writable($indexDir)) {
    die("indexDir directory is not writable: $indexDir\n");
}
// Open (or create) the SQLite database that stores the extracted book text.
$bookDBPath = __DIR__ . '/indexDir/books';
$bookDB = new PDO('sqlite:' . $bookDBPath);

// Schema: one table for stored books, one for files whose processing failed.
$bookDB->exec("CREATE TABLE IF NOT EXISTS books (id INTEGER PRIMARY KEY, filename TEXT UNIQUE, content TEXT)");
$bookDB->exec("CREATE TABLE IF NOT EXISTS failed_books (id INTEGER PRIMARY KEY, filename TEXT UNIQUE)");

// Collect every filename already recorded in either table, keyed by name so
// later membership checks are a plain isset().
$existingFiles = [];
foreach (['books', 'failed_books'] as $knownTable) {
    foreach ($bookDB->query('SELECT filename FROM ' . $knownTable) as $row) {
        $existingFiles[$row['filename']] = true;
    }
}

$parser = new Parser();
$documents = [];

// Next id to hand out: one past the largest id in either table
// (MAX(id) is NULL on an empty table, which casts to 0 and yields 1).
$id = ((int) $bookDB->query('SELECT MAX(id) FROM books')->fetchColumn()) + 1;
$id2 = ((int) $bookDB->query('SELECT MAX(id) FROM failed_books')->fetchColumn()) + 1;
$id = max($id, $id2);
// Scan the "book" folder for PDFs.
$bookDir = __DIR__ . '/book';
// glob() returns false on error (and, on some systems, when nothing matches).
// Fall back to an empty list so shuffle() and the processing loop always
// receive an array instead of raising a TypeError on PHP 8.
$pdfFiles = glob($bookDir . '/*.pdf') ?: [];
// Randomize the processing order.
shuffle($pdfFiles);
// Set up a single TNTSearch instance, once, before any file is processed.
echo "Initializing TNTSearch...\n";
$tntConfig = [
    'storage' => __DIR__ . '/indexDir',
    'driver' => 'sqlite',
    'database' => __DIR__ . '/indexDir/books',
    'charset' => 'utf8',
    'collation' => 'utf8_unicode_ci',
];
$tnt = new TNTSearch();
$tnt->loadConfig($tntConfig);

$indexName = 'books_index';
$indexPath = __DIR__ . "/indexDir/{$indexName}";

// First run only: create the index file. The LIMIT 0 query selects no rows,
// so the initial run has nothing to index.
if (!file_exists($indexPath)) {
    echo "Creating initial index...\n";
    $indexer = $tnt->createIndex($indexName);
    $indexer->setPrimaryKey('id');
    $indexer->query('SELECT id, filename, content FROM books LIMIT 0;');
    $indexer->run();
}

// Select the index and obtain the indexer object reused for every insert.
$tnt->selectIndex($indexName);
$indexer = $tnt->getIndex();
// Ingest loop: for every PDF that is not yet recorded, extract its text through
// the remote parsing API, store it in the books table and add it to the
// TNTSearch index. Files that raise an error are recorded in failed_books.
//
// NOTE(review): the reported slowness is inside $indexer->insert(). Presumably
// each index write runs in its own implicit SQLite transaction; wrapping a batch
// of insert() calls in one transaction on the index connection is the usual
// remedy -- confirm how the installed TNTSearch version exposes that connection.

// NOTE(security): this secret is hard-coded and travels in the URL query string,
// where it can end up in server and proxy logs. Move it to configuration and
// rotate it; it is left unchanged here so the script keeps working as-is.
$secretKey = "ksajdksajdkasjdhkasjdhkashdka";
// API URL with secret parameter
$apiUrl = 'https://www.hkprog.org/wp-content/plugins/hkps-wordpress-plugin/api/parsePdf.php?secret=' . $secretKey;

// Statements are prepared once and re-executed per file instead of being
// re-prepared on every iteration.
$bookExistsStmt = $bookDB->prepare('SELECT COUNT(*) FROM books WHERE filename = :filename');
$insertBookStmt = $bookDB->prepare("INSERT INTO books (id, filename, content) VALUES (:id, :filename, :content)");
$insertFailedStmt = $bookDB->prepare("INSERT INTO failed_books (id, filename) VALUES (:id, :filename)");
$deleteBookStmt = $bookDB->prepare("delete from books where filename = :filename");

/**
 * Sends one PDF to the parsing API and returns the response body.
 *
 * Local alternative (currently disabled in this script):
 *   $pdf = $parser->parseFile($pdfFile);
 *   $content = $pdf->getText();
 *
 * @param string $pdfFile Path of the PDF to upload.
 * @return string Raw response body returned by the API.
 * @throws RuntimeException If the file cannot be read, the transfer fails,
 *                          or the API answers with an HTTP error status.
 */
$fetchPdfText = static function (string $pdfFile) use ($apiUrl): string {
    $pdfContent = file_get_contents($pdfFile);
    if ($pdfContent === false) {
        throw new RuntimeException("Unable to read PDF: $pdfFile");
    }

    // Prepare the POST data
    $postData = json_encode([
        'pdfData' => base64_encode($pdfContent),
    ]);

    $ch = curl_init($apiUrl);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, [
        'Content-Type: application/json',
        'Content-Length: ' . strlen($postData),
    ]);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $response = curl_exec($ch);
    // Capture diagnostics before the handle is closed.
    $curlFailed = $response === false || curl_errno($ch) !== 0;
    $curlMessage = curl_error($ch);
    $httpStatus = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    // Previously a transfer error was only echoed and the (false) result was
    // still stored and indexed as the book's content.
    if ($curlFailed) {
        throw new RuntimeException('cURL Error: ' . $curlMessage);
    }
    if ($httpStatus >= 400) {
        throw new RuntimeException("PDF parsing API returned HTTP $httpStatus");
    }

    return $response;
};

// Start transaction for better DB performance
$bookDB->beginTransaction();
$transactionCount = 0;
$batchSize = 5; // Commit every 5 inserts
$x = 0;
foreach ($pdfFiles as $pdfFile) {
    $filename = basename($pdfFile);
    if (isset($existingFiles[$filename])) {
        // Already indexed
        continue;
    }
    try {
        $startTime = microtime(true);
        echo $x." Processing: $pdfFile\n";

        // Check if book exists in bookDB (by filename)
        $bookExistsStmt->execute([':filename' => $filename]);
        $bookExists = $bookExistsStmt->fetchColumn() > 0;
        // Release the cursor so the reused SELECT is not left active.
        $bookExistsStmt->closeCursor();
        if ($bookExists) {
            echo "Book already exists in DB: $filename\n";
            continue;
        }

        // Check if file is a valid PDF (simple header check)
        $fh = fopen($pdfFile, 'rb');
        if ($fh === false) {
            throw new RuntimeException("Unable to open PDF: $pdfFile");
        }
        $header = fread($fh, 5);
        fclose($fh);
        if ($header !== '%PDF-') {
            echo "Skipping invalid PDF: $pdfFile\n";
            continue;
        }

        echo " Parsing PDF: $filename\n";
        $content = $fetchPdfText($pdfFile);

        // Optionally truncate very large content to reduce indexing time
        // Uncomment if you want to limit content size:
        // $maxContentLength = 200000; // ~200KB
        // if (strlen($content) > $maxContentLength) {
        //     echo " Truncating large content from " . strlen($content) . " to $maxContentLength bytes\n";
        //     $content = substr($content, 0, $maxContentLength);
        // }

        // Insert into SQLite immediately after processing
        echo " Inserting into DB: $filename (content: ".strlen($content)." bytes)\n";
        $currentId = $id++;
        $insertBookStmt->execute([
            ':id' => $currentId,
            ':filename' => $filename,
            ':content' => $content,
        ]);

        // Index with TNTSearch (reusing the same indexer object)
        echo " Indexing with TNTSearch...\n";
        $indexer->insert([
            'id' => $currentId,
            'filename' => $filename,
            'content' => $content,
        ]);

        // Commit periodically, and only after indexing succeeded, so a batch
        // is never committed with a row whose index insert has not succeeded.
        $transactionCount++;
        if ($transactionCount >= $batchSize) {
            $bookDB->commit();
            $bookDB->beginTransaction();
            $transactionCount = 0;
            echo " Committed batch to DB\n";
        }

        $endTime = microtime(true);
        $processingTime = round($endTime - $startTime, 2);
        echo " ✓ Completed: $filename (took {$processingTime}s)\n";
    } catch (Throwable $e) {
        echo "Error processing $filename: " . $e->getMessage() . "\n";
        // Remember the failure so the file is skipped on the next run, and
        // remove any books row that was written before the error.
        $insertFailedStmt->execute([
            ':id' => $id++,
            ':filename' => $filename,
        ]);
        $deleteBookStmt->execute([
            ':filename' => $filename,
        ]);
        $transactionCount++;
    }
    $x++;
}
// Commit any remaining transactions
if ($transactionCount > 0) {
    $bookDB->commit();
    echo "Final commit to DB\n";
}
echo "Indexing complete.\n";
Metadata
Metadata
Assignees
Labels
No labels