mirror of
https://git.friendi.ca/friendica/friendica-addons.git
synced 2025-06-07 18:04:26 +02:00
tesseract/tesseract.php aktualisiert
Modified the addon. It previously generated a high system load, which could make the server unavailable. These changes make better use of system resources: - Creates tesseract-limited.sh when the addon is installed and removes it when the addon is uninstalled; the wrapper runs tesseract with a timeout and resource limits. - Checks for permitted image formats before running OCR to avoid wasting resources.
This commit is contained in:
parent
45d3a6fb74
commit
219e10c270
1 changed files with 83 additions and 6 deletions
|
@ -1,9 +1,10 @@
|
|||
<?php
|
||||
/**
|
||||
* Name: Tesseract OCR
|
||||
* Description: Use OCR to get text from images
|
||||
* Version: 0.1
|
||||
* Description: Use OCR to extract text from images (with timeout, resource limits, alt-text & format checks)
|
||||
* Version: 0.2
|
||||
* Author: Michael Vogel <http://pirati.ca/profile/heluecht>
|
||||
* Modified by: Matthias Ebers <http://loma.ml/profile/feb>
|
||||
*/
|
||||
|
||||
use Friendica\Core\Hook;
|
||||
|
@ -13,25 +14,101 @@ use thiagoalessio\TesseractOCR\TesseractOCR;
|
|||
|
||||
require_once __DIR__ . DIRECTORY_SEPARATOR . 'vendor' . DIRECTORY_SEPARATOR . 'autoload.php';
|
||||
|
||||
/**
 * Called when the addon is enabled.
 *
 * Registers the 'ocr-detection' hook and makes sure the wrapper script
 * "tesseract-limited.sh" exists next to this file. The wrapper starts
 * /usr/bin/tesseract through `timeout 5s`, `nice -n 10` and `ionice -c3`,
 * so that a single OCR run is time-boxed and runs at low CPU/IO priority.
 *
 * An existing wrapper is left untouched. Failures to write the script or to
 * make it executable are logged as warnings instead of being reported as
 * success.
 *
 * @return void
 */
function tesseract_install()
{
	Hook::register('ocr-detection', __FILE__, 'tesseract_ocr_detection');

	$wrapperPath = __DIR__ . '/tesseract-limited.sh';

	// Built line by line (no trailing newline) so the generated file does not
	// depend on heredoc indentation rules; "$@" forwards all arguments as-is.
	$script = implode("\n", [
		'#!/bin/bash',
		'# Wrapper for tesseract with timeout and resource limits',
		'timeout 5s nice -n 10 ionice -c3 /usr/bin/tesseract "$@"',
	]);

	if (file_exists($wrapperPath)) {
		DI::logger()->info('Tesseract wrapper script already exists', ['path' => $wrapperPath]);
	} elseif (file_put_contents($wrapperPath, $script) === false) {
		// Typically a permission problem in the addon directory.
		DI::logger()->warning('Tesseract wrapper script could not be created', ['path' => $wrapperPath]);
	} elseif (!chmod($wrapperPath, 0755)) {
		// Without the executable bit the wrapper cannot be launched.
		DI::logger()->warning('Tesseract wrapper script could not be made executable', ['path' => $wrapperPath]);
	} else {
		DI::logger()->notice('Tesseract wrapper script created', ['path' => $wrapperPath]);
	}

	DI::logger()->notice('Tesseract OCR addon installed');
}
|
||||
|
||||
/**
 * Called when the addon is disabled.
 *
 * Deletes the wrapper script "tesseract-limited.sh" located next to this file
 * (if it exists) and unregisters the 'ocr-detection' hook. A failed deletion
 * is logged as a warning rather than being reported as a successful removal;
 * the hook is unregistered in either case.
 *
 * @return void
 */
function tesseract_uninstall()
{
	$wrapperPath = __DIR__ . '/tesseract-limited.sh';

	if (file_exists($wrapperPath)) {
		if (unlink($wrapperPath)) {
			DI::logger()->notice('Tesseract wrapper script removed', ['path' => $wrapperPath]);
		} else {
			DI::logger()->warning('Tesseract wrapper script could not be removed', ['path' => $wrapperPath]);
		}
	}

	Hook::unregister('ocr-detection', __FILE__, 'tesseract_ocr_detection');
	DI::logger()->notice('Tesseract OCR addon uninstalled');
}
|
||||
|
||||
/**
|
||||
* Main OCR processing hook for incoming images
|
||||
*/
|
||||
function tesseract_ocr_detection(&$media)
|
||||
{
|
||||
// Skip OCR if image already contains an alt-text
|
||||
if (!empty($media['description'])) {
|
||||
DI::logger()->debug('Image already has description, skipping OCR');
|
||||
return;
|
||||
}
|
||||
|
||||
// Only allow specific MIME types for OCR
|
||||
$allowedTypes = ['image/jpeg', 'image/png', 'image/bmp', 'image/tiff'];
|
||||
if (!empty($media['type']) && !in_array($media['type'], $allowedTypes)) {
|
||||
DI::logger()->debug('Unsupported image type for OCR', ['type' => $media['type']]);
|
||||
return;
|
||||
}
|
||||
|
||||
// Alternatively skip GIF files based on filename
|
||||
if (empty($media['type']) && !empty($media['filename']) && preg_match('/\.gif$/i', $media['filename'])) {
|
||||
DI::logger()->debug('GIF image detected via filename, skipping OCR');
|
||||
return;
|
||||
}
|
||||
|
||||
$ocr = new TesseractOCR();
|
||||
|
||||
try {
|
||||
// Use wrapper script with timeout and niceness
|
||||
$ocr->executable(__DIR__ . '/tesseract-limited.sh');
|
||||
|
||||
// Detect and set available languages
|
||||
$languages = $ocr->availableLanguages();
|
||||
if ($languages) {
|
||||
/** @phpstan-ignore-next-line ignore call of \thiagoalessio\TesseractOCR\Option::lang() */
|
||||
$ocr->lang(implode('+', $languages));
|
||||
}
|
||||
|
||||
// Use Friendica's temporary path
|
||||
$ocr->tempDir(System::getTempPath());
|
||||
|
||||
// Provide raw image data to Tesseract
|
||||
$ocr->imageData($media['img_str'], strlen($media['img_str']));
|
||||
$media['description'] = $ocr->run();
|
||||
|
||||
// Run OCR and assign description if text is found
|
||||
$text = trim($ocr->run());
|
||||
|
||||
if (!empty($text)) {
|
||||
$media['description'] = $text;
|
||||
DI::logger()->debug('OCR text detected', ['text' => $text]);
|
||||
} else {
|
||||
DI::logger()->debug('No text detected in image');
|
||||
}
|
||||
} catch (\Throwable $th) {
|
||||
DI::logger()->info('Error calling TesseractOCR', ['message' => $th->getMessage()]);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue