
indexing too slow #360

@quantrpeter

Description


Hi,
Indexing is too slow. Is there any way to make it run faster? I am on an AMD 3900X 12-core CPU with 128 GB of RAM, and indexing a 6 MB PDF takes over a minute. I have thousands of books to index. Thanks.

It gets stuck on this code:

		$indexer->insert([
			'id' => $currentId,
			'filename' => $filename,
			'content' => $content,
		]);
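
To confirm where the time actually goes, a finer-grained timer around the two inserts helps (a minimal sketch; it reuses the $bookDB, $indexer, $currentId, $filename and $content variables from the full script below):

		// Timing sketch: compare the SQLite insert with the TNTSearch insert for one document.
		$t0 = microtime(true);
		$stmt = $bookDB->prepare("INSERT INTO books (id, filename, content) VALUES (:id, :filename, :content)");
		$stmt->execute([':id' => $currentId, ':filename' => $filename, ':content' => $content]);
		$t1 = microtime(true);

		$indexer->insert([
			'id' => $currentId,
			'filename' => $filename,
			'content' => $content,
		]);
		$t2 = microtime(true);

		printf("SQLite insert: %.2fs, TNTSearch insert: %.2fs (content: %d bytes)\n",
			$t1 - $t0, $t2 - $t1, strlen($content));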

Full source code:

<?php
require_once __DIR__ . '/vendor/autoload.php';

use Smalot\PdfParser\Parser;
use TeamTNT\TNTSearch\TNTSearch;

// Ensure the tntsearch directory exists and is writable
$indexDir = __DIR__ . '/indexDir';
if (!is_dir($indexDir)) {
	if (!mkdir($indexDir, 0777, true) && !is_dir($indexDir)) {
		die("Failed to create indexDir directory: $indexDir\n");
	}
}
if (!is_writable($indexDir)) {
	die("indexDir directory is not writable: $indexDir\n");
}

// Prepare main database for books
$bookDBPath = __DIR__ . '/indexDir/books';
$bookDB = new PDO('sqlite:' . $bookDBPath);

// Create books table if it doesn't exist
$bookDB->exec("CREATE TABLE IF NOT EXISTS books (id INTEGER PRIMARY KEY, filename TEXT UNIQUE, content TEXT)");
$bookDB->exec("CREATE TABLE IF NOT EXISTS failed_books (id INTEGER PRIMARY KEY, filename TEXT UNIQUE)");

// Get already indexed filenames
$existingFiles = [];
foreach ($bookDB->query('SELECT filename FROM books') as $row) {
	$existingFiles[$row['filename']] = true;
}
foreach ($bookDB->query('SELECT filename FROM failed_books') as $row) {
	$existingFiles[$row['filename']] = true;
}

$parser = new Parser();
$documents = [];

// Find the max id for new inserts
$id = (int)$bookDB->query('SELECT MAX(id) FROM books')->fetchColumn();
$id = $id ? $id + 1 : 1;


$id2 = (int)$bookDB->query('SELECT MAX(id) FROM failed_books')->fetchColumn();
$id2 = $id2 ? $id2 + 1 : 1;

if ($id2 > $id) {
	$id = $id2; // Ensure we start from the highest ID
}

// Scan the "book" folder for PDFs
$bookDir = __DIR__ . '/book';
$pdfFiles = glob($bookDir . '/*.pdf');

// randomize $pdfFiles
shuffle($pdfFiles);

// Initialize TNTSearch ONCE outside the loop
echo "Initializing TNTSearch...\n";
$tnt = new TNTSearch();
$tnt->loadConfig([
	'storage'   => __DIR__ . '/indexDir',
	'driver'    => 'sqlite',
	'database'  => __DIR__ . '/indexDir/books',
	'charset'   => 'utf8',
	'collation' => 'utf8_unicode_ci',
]);

$indexName = 'books_index';
if (!file_exists(__DIR__ . "/indexDir/{$indexName}")) {
	echo "Creating initial index...\n";
	$indexer = $tnt->createIndex($indexName);
	$indexer->setPrimaryKey('id');
	$indexer->query('SELECT id, filename, content FROM books LIMIT 0;'); // Empty initial query
	$indexer->run();
}

$tnt->selectIndex($indexName);
$indexer = $tnt->getIndex();

// Start transaction for better DB performance
$bookDB->beginTransaction();
$transactionCount = 0;
$batchSize = 5; // Commit every 5 inserts

$x = 0;
foreach ($pdfFiles as $pdfFile) {
	$filename = basename($pdfFile);
	if (isset($existingFiles[$filename])) {
		// Already indexed
		continue;
	}
	try {
		$startTime = microtime(true);
		echo $x." Processing: $pdfFile\n";

		// Check if book exists in bookDB (by filename)
		$stmt = $bookDB->prepare('SELECT COUNT(*) FROM books WHERE filename = :filename');
		$stmt->execute([':filename' => $filename]);
		$bookExists = $stmt->fetchColumn() > 0;
		if ($bookExists) {
			echo "Book already exists in DB: $filename\n";
			continue;
		}

		// Check if file is a valid PDF (simple header check)
		$fh = fopen($pdfFile, 'rb');
		if ($fh === false) {
			echo "Skipping unreadable file: $pdfFile\n";
			continue;
		}
		$header = fread($fh, 5);
		fclose($fh);
		if ($header !== '%PDF-') {
			echo "Skipping invalid PDF: $pdfFile\n";
			continue;
		}

		echo "    Parsing PDF: $filename\n";
		/*
		$pdf = $parser->parseFile($pdfFile);
		$content = $pdf->getText();
		*/

		// Prepare the PDF data
		$pdfContent = file_get_contents($pdfFile);
		$pdfBase64 = base64_encode($pdfContent);

		// Secret key from parsePhp.php
		$secretKey = "ksajdksajdkasjdhkasjdhkashdka";

		// API URL with secret parameter
		$apiUrl = 'https://www.hkprog.org/wp-content/plugins/hkps-wordpress-plugin/api/parsePdf.php?secret=' . $secretKey;

		// Prepare the POST data
		$postData = json_encode([
			'pdfData' => $pdfBase64
		]);

		// Initialize cURL
		$ch = curl_init($apiUrl);
		curl_setopt($ch, CURLOPT_POST, 1);
		curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($ch, CURLOPT_HTTPHEADER, [
			'Content-Type: application/json',
			'Content-Length: ' . strlen($postData)
		]);

		// Optional: Set timeout
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);

		// Execute and get response
		$content = curl_exec($ch);

		// Treat transport errors as failures so the file is recorded in failed_books
		if ($content === false) {
			$error = curl_error($ch);
			curl_close($ch);
			throw new RuntimeException('cURL error: ' . $error);
		}

		curl_close($ch);

		// Optionally truncate very large content to reduce indexing time
		// Uncomment if you want to limit content size:
		// $maxContentLength = 200000; // ~200KB
		// if (strlen($content) > $maxContentLength) {
		//     echo "    Truncating large content from " . strlen($content) . " to $maxContentLength bytes\n";
		//     $content = substr($content, 0, $maxContentLength);
		// }

		// Insert into SQLite immediately after processing
		echo "    Inserting into DB: $filename (content: ".strlen($content)." bytes)\n";
		$currentId = $id++;
		$stmt = $bookDB->prepare("INSERT INTO books (id, filename, content) VALUES (:id, :filename, :content)");
		$stmt->execute([
			':id' => $currentId,
			':filename' => $filename,
			':content' => $content,
		]);

		// Commit transaction periodically
		$transactionCount++;
		if ($transactionCount >= $batchSize) {
			$bookDB->commit();
			$bookDB->beginTransaction();
			$transactionCount = 0;
			echo "    Committed batch to DB\n";
		}

		// Index with TNTSearch (reusing the same indexer object)
		echo "    Indexing with TNTSearch...\n";
		$indexer->insert([
			'id' => $currentId,
			'filename' => $filename,
			'content' => $content,
		]);

		$endTime = microtime(true);
		$processingTime = round($endTime - $startTime, 2);
		echo "    ✓ Completed: $filename (took {$processingTime}s)\n";
	} catch (Throwable $e) {
		echo "Error processing $filename: " . $e->getMessage() . "\n";
		$stmt = $bookDB->prepare("INSERT INTO failed_books (id, filename) VALUES (:id, :filename)");
		$stmt->execute([
			':id' => $id++,
			':filename' => $filename
		]);
		$stmt = $bookDB->prepare("delete from books where filename = :filename");
		$stmt->execute([
			':filename' => $filename
		]);
		$transactionCount++;
	}
	$x++;
}

// Commit any remaining transactions
if ($transactionCount > 0) {
	$bookDB->commit();
	echo "Final commit to DB\n";
}

echo "Indexing complete.\n";

