Skip to content

Commit bd7f540

Browse files
committed
Update categorization
1 parent f2c0622 commit bd7f540

15 files changed

Lines changed: 684 additions & 62 deletions

app/Events/ReleaseNameFixed.php

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace App\Events;
6+
7+
use Illuminate\Foundation\Events\Dispatchable;
8+
use Illuminate\Queue\SerializesModels;
9+
10+
/**
 * Event carrying the details of a release whose name has been fixed.
 *
 * All properties are public and readonly; the event only transports data
 * and contains no behaviour of its own.
 */
class ReleaseNameFixed
{
    use Dispatchable, SerializesModels;

    /**
     * @param  int  $releaseId  Identifier of the release that was renamed.
     * @param  string  $oldName  Name of the release before the fix.
     * @param  string  $newName  Name of the release after the fix.
     * @param  int  $oldCategoryId  Category id supplied by the dispatcher as the previous category.
     * @param  int|string  $groupId  Group identifier supplied by the dispatcher.
     * @param  string  $poster  Poster of the release; defaults to an empty string.
     */
    public function __construct(
        public readonly int $releaseId,
        public readonly string $oldName,
        public readonly string $newName,
        public readonly int $oldCategoryId,
        public readonly int|string $groupId,
        public readonly string $poster = '',
    ) {}
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace App\Listeners;
6+
7+
use App\Events\ReleaseNameFixed;
8+
use App\Models\Release;
9+
use App\Services\Categorization\CategorizationService;
10+
use Illuminate\Support\Facades\Log;
11+
12+
/**
 * Listener for ReleaseNameFixed: re-runs categorization using the new release
 * name and stores the resulting category on the release row.
 */
class RecategorizeReleaseAfterNameFix
{
    public function __construct(private readonly CategorizationService $categorization) {}

    /**
     * Recategorize the release referenced by the event.
     *
     * Does nothing when the release no longer exists, or when the computed
     * category equals the stored one and the release is already flagged as
     * categorized. Otherwise the row is updated and, if the
     * `nntmux.categorization.log` config flag is enabled, the change is logged.
     */
    public function handle(ReleaseNameFixed $event): void
    {
        $columns = [
            'id',
            'groups_id',
            'fromname',
            'categories_id',
            'iscategorized',
            'searchname',
        ];

        $release = Release::query()->find($event->releaseId, $columns);
        if ($release === null) {
            return;
        }

        // Prefer the poster stored on the release; fall back to the one carried by the event.
        $poster = (string) ($release->fromname ?? $event->poster);
        $result = $this->categorization->determineCategory($release->groups_id, $event->newName, $poster);

        // When categorization yields no category id, keep the one already stored.
        $newCategoryId = (int) ($result['categories_id'] ?? $release->categories_id);
        $categoryUnchanged = (int) $release->categories_id === $newCategoryId;

        if ($categoryUnchanged && (int) $release->iscategorized === 1) {
            return;
        }

        $changes = [
            'categories_id' => $newCategoryId,
            'iscategorized' => 1,
        ];
        Release::query()->where('id', $release->id)->update($changes);

        if (! config('nntmux.categorization.log', false)) {
            return;
        }

        Log::info('categorization.rename_recategorized', [
            'release_id' => $release->id,
            'old_name' => $event->oldName,
            'new_name' => $event->newName,
            'old_category_id' => $event->oldCategoryId,
            'new_category_id' => $newCategoryId,
        ]);
    }
}

app/Providers/AppServiceProvider.php

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
namespace App\Providers;
66

7+
use App\Events\ReleaseNameFixed;
8+
use App\Listeners\RecategorizeReleaseAfterNameFix;
79
use App\Models\AnidbInfo;
810
use App\Models\AnidbTitle;
911
use App\Models\BookInfo;
@@ -59,6 +61,7 @@ public function boot(): void
5961
return $user->hasRole('Admin');
6062
});
6163
Event::listen(Login::class, LoginViaRemember::class);
64+
Event::listen(ReleaseNameFixed::class, RecategorizeReleaseAfterNameFix::class);
6265

6366
// Register observers
6467
RolePromotion::observe(RolePromotionObserver::class);

app/Services/Categorization/CategorizationPipeline.php

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
use App\Services\Categorization\Pipes\CategorizationPassable;
1111
use Illuminate\Pipeline\Pipeline;
1212
use Illuminate\Support\Collection;
13+
use Illuminate\Support\Facades\Log;
1314

1415
/**
1516
* Pipeline-based categorization service using Laravel Pipeline.
@@ -88,9 +89,34 @@ public function categorize(
8889
->through($this->pipes->values()->all())
8990
->thenReturn();
9091

92+
$this->logCategorization($result);
93+
9194
return $result->toArray();
9295
}
9396

97+
/**
 * Write the outcome of a categorization run to the log.
 *
 * Nothing is logged unless the `nntmux.categorization.log` config flag is
 * enabled. An info-level "decision" entry is written only when the result is
 * locked to misc or was matched by `group_only_low_signal`; a debug-level
 * "trace" entry that also contains every collected result is always written.
 */
protected function logCategorization(CategorizationPassable $result): void
{
    $loggingEnabled = config('nntmux.categorization.log', false);
    if (! $loggingEnabled) {
        return;
    }

    $best = $result->bestResult;

    $payload = [
        'release_name' => $result->context->releaseName,
        'group_name' => $result->context->groupName,
        'category_id' => $best->categoryId,
        'matched_by' => $best->matchedBy,
        'confidence' => $best->confidence,
        'locked_to_misc' => $result->lockedToMisc,
        'misc_analysis' => $result->miscAnalysis,
    ];

    $isNotableDecision = $result->lockedToMisc || $best->matchedBy === 'group_only_low_signal';
    if ($isNotableDecision) {
        Log::info('categorization.decision', $payload);
    }

    // The trace entry is the decision payload plus the full list of results.
    $trace = $payload;
    $trace['all_results'] = $result->allResults;

    Log::debug('categorization.trace', $trace);
}
119+
94120
/**
95121
* Get all registered categorizers (pipes).
96122
*
@@ -107,6 +133,7 @@ public function getCategorizers(): Collection // @phpstan-ignore missingType.gen
107133
public static function createDefault(): self
108134
{
109135
return new self([
136+
new Pipes\MiscPipe,
110137
new Pipes\GroupNamePipe,
111138
new Pipes\XxxPipe,
112139
new Pipes\TvPipe,
@@ -115,7 +142,7 @@ public static function createDefault(): self
115142
new Pipes\MusicPipe,
116143
new Pipes\PcPipe,
117144
new Pipes\ConsolePipe,
118-
new Pipes\MiscPipe,
145+
new Pipes\MiscSafetyNetPipe,
119146
]);
120147
}
121148
}

app/Services/Categorization/CategorizationResult.php

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ public function isSuccessful(): bool
5151
*/
5252
public function shouldOverride(CategorizationResult $other): bool
5353
{
54+
if ($this->categoryId !== Category::OTHER_MISC && $this->categoryId !== Category::OTHER_HASHED &&
55+
$other->isProtectedMiscResult() && $this->confidence < 0.6) {
56+
return false;
57+
}
58+
5459
// Higher confidence always wins
5560
if ($this->confidence > $other->confidence) {
5661
return true;
@@ -64,6 +69,25 @@ public function shouldOverride(CategorizationResult $other): bool
6469
return false;
6570
}
6671

72+
/**
 * Tell whether this result is a "protected" misc result.
 *
 * A result is protected when its category is OTHER_HASHED, or when its
 * category is OTHER_MISC and it was matched by `group_only_low_signal` or by
 * a matcher whose name starts with `hash_`, `obfuscated_` or `gibberish_`.
 */
public function isProtectedMiscResult(): bool
{
    if ($this->categoryId === Category::OTHER_HASHED) {
        return true;
    }

    if ($this->categoryId !== Category::OTHER_MISC) {
        return false;
    }

    if ($this->matchedBy === 'group_only_low_signal') {
        return true;
    }

    foreach (['hash_', 'obfuscated_', 'gibberish_'] as $protectedPrefix) {
        if (str_starts_with($this->matchedBy, $protectedPrefix)) {
            return true;
        }
    }

    return false;
}
90+
6791
/**
6892
* Create a failed/empty result.
6993
*/

app/Services/Categorization/Categorizers/GroupNameCategorizer.php

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,25 @@ public function categorize(ReleaseContext $context): CategorizationResult
2424
return $this->noMatch();
2525
}
2626
if (preg_match('/alt\.binaries\..*?(tv|hdtv|tvseries)/i', $groupName)) {
27-
return $this->matched(Category::TV_OTHER, 0.6, 'group_tv');
27+
return $this->matched(Category::TV_OTHER, 0.6, 'group_name_tv');
2828
}
2929
if (preg_match('/alt\.binaries\..*?(movies?|dvd|bluray|x264)/i', $groupName)) {
30-
return $this->matched(Category::MOVIE_OTHER, 0.6, 'group_movie');
30+
return $this->matched(Category::MOVIE_OTHER, 0.6, 'group_name_movie');
3131
}
3232
if (preg_match('/alt\.binaries\..*?(erotica|pictures\.erotica|xxx)/i', $groupName)) {
33-
return $this->matched(Category::XXX_OTHER, 0.7, 'group_xxx');
33+
return $this->matched(Category::XXX_OTHER, 0.7, 'group_name_xxx');
3434
}
3535
if (preg_match('/alt\.binaries\..*?(sounds?|mp3|music|lossless)/i', $groupName)) {
36-
return $this->matched(Category::MUSIC_OTHER, 0.6, 'group_music');
36+
return $this->matched(Category::MUSIC_OTHER, 0.6, 'group_name_music');
3737
}
3838
if (preg_match('/alt\.binaries\..*?(games?|console|psx|nintendo)/i', $groupName)) {
39-
return $this->matched(Category::GAME_OTHER, 0.6, 'group_game');
39+
return $this->matched(Category::GAME_OTHER, 0.6, 'group_name_game');
4040
}
4141
if (preg_match('/alt\.binaries\..*?(warez|0day|apps?|software)/i', $groupName)) {
42-
return $this->matched(Category::PC_0DAY, 0.6, 'group_pc');
42+
return $this->matched(Category::PC_0DAY, 0.6, 'group_name_pc');
4343
}
4444
if (preg_match('/alt\.binaries\..*?(e-?book|ebook|comics?)/i', $groupName)) {
45-
return $this->matched(Category::BOOKS_EBOOK, 0.5, 'group_book');
45+
return $this->matched(Category::BOOKS_EBOOK, 0.5, 'group_name_book');
4646
}
4747

4848
return $this->noMatch();

app/Services/Categorization/Categorizers/MiscCategorizer.php

Lines changed: 98 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,21 @@ public function categorize(ReleaseContext $context): CategorizationResult
3434
return $result;
3535
}
3636

37+
$analysis = $this->inspectSignals($name);
38+
if ($this->isZeroVowelLongToken($analysis['coreName'])) {
39+
return $this->matched(Category::OTHER_HASHED, 0.78, 'gibberish_zero_vowels');
40+
}
41+
3742
// Check for obfuscated/encoded patterns
3843
if ($result = $this->checkObfuscated($name)) {
3944
return $result;
4045
}
4146

47+
// Check low-signal names that only contain random-looking tokens
48+
if ($result = $this->checkLowSignal($name)) {
49+
return $result;
50+
}
51+
4252
// Check for gibberish patterns (character-analysis heuristics)
4353
if ($result = $this->checkGibberish($name)) {
4454
return $result;
@@ -57,6 +67,50 @@ public function categorize(ReleaseContext $context): CategorizationResult
5767
return $this->noMatch();
5868
}
5969

70+
/**
 * Scan a release name for media signal markers.
 *
 * Accepts either a ReleaseContext (its releaseName is used) or a plain name.
 * Every marker pattern is tested against the name as given, while the "core
 * name" is derived from the name after extension stripping and separator
 * removal. A name counts as low-signal when no marker matched, the core name
 * consists solely of the characters A-Z, a-z, 0-9, '+', '/', '_', '=' and '-',
 * and the core name is at least 12 bytes long.
 *
 * @return array{coreName: string, coreLength: int, signalScore: int, markers: list<string>, lowSignal: bool}
 */
public function inspectSignals(ReleaseContext|string $context): array
{
    $name = $context instanceof ReleaseContext ? $context->releaseName : $context;

    $coreName = $this->getCoreNameWithoutSeparators(
        $this->stripExtensionsForAnalysis($name)
    );
    $coreLength = strlen($coreName);

    $patterns = [
        'season_episode' => '/\bS\d{1,3}[._ -]?E\d{1,4}\b/i',
        'season_pack' => '/\bS\d{1,3}\b/i',
        'resolution' => '/\b(480p|576p|720p|1080[pi]?|2160p|4k|uhd)\b/i',
        'codec' => '/\b(x264|x265|h\.?264|h\.?265|hevc|xvid|av1)\b/i',
        'source' => '/\b(bluray|bdrip|brrip|hdtv|web[._ -]?dl|web[._ -]?rip|dvdrip|remux)\b/i',
        'audio' => '/\b(aac|ac3|ddp|dts|flac|mp3)\b/i',
        'scene_tag' => '/\b(proper|repack|internal|limited|complete|dubbed|subbed|readnfo)\b/i',
        'year' => '/\b(19|20)\d{2}\b/',
        'release_group' => '/-[A-Za-z0-9][A-Za-z0-9._-]{1,20}$/',
        'known_extension' => '/\.(mkv|avi|mp4|mp3|flac|iso|epub|pdf|exe|nzb|rar|7z)$/i',
    ];

    // Keep the names of the patterns that match, in declaration order.
    $markers = array_keys(array_filter(
        $patterns,
        static fn (string $pattern): bool => preg_match($pattern, $name) === 1,
    ));
    $signalScore = count($markers);

    $looksLikeSingleToken = preg_match('/^[A-Za-z0-9+\/_=-]+$/', $coreName) === 1;
    $lowSignal = $signalScore === 0 && $looksLikeSingleToken && $coreLength >= 12;

    return [
        'coreName' => $coreName,
        'coreLength' => $coreLength,
        'signalScore' => $signalScore,
        'markers' => $markers,
        'lowSignal' => $lowSignal,
    ];
}
113+
60114
protected function checkHash(string $name): ?CategorizationResult
61115
{
62116
// MD5 hash (32 hex characters)
@@ -79,6 +133,10 @@ protected function checkHash(string $name): ?CategorizationResult
79133
return $this->matched(Category::OTHER_HASHED, 0.95, 'hash_generic');
80134
}
81135

136+
if ($this->isBase64LikeToken($name)) {
137+
return $this->matched(Category::OTHER_HASHED, 0.9, 'hash_base64_like');
138+
}
139+
82140
// Strip extensions and separators for core-name checks
83141
$cleaned = $this->stripExtensionsForAnalysis($name);
84142
$coreName = $this->getCoreNameWithoutSeparators($cleaned);
@@ -115,7 +173,18 @@ protected function checkObfuscated(string $name): ?CategorizationResult
115173

116174
// Only punctuation and numbers with no clear structure
117175
if ($this->isObfuscatedPunctuation($name)) {
118-
return $this->matched(Category::OTHER_MISC, 0.5, 'obfuscated_pattern');
176+
$analysis = $this->inspectSignals($name);
177+
$hashLike = $this->isBase64LikeToken($name)
178+
|| $this->isBoundedGenericHash($name)
179+
|| $this->isZeroVowelLongToken($analysis['coreName'], 12)
180+
|| $analysis['lowSignal'];
181+
182+
return $this->matched(
183+
$hashLike ? Category::OTHER_HASHED : Category::OTHER_MISC,
184+
$hashLike ? 0.75 : 0.5,
185+
'obfuscated_pattern',
186+
['signal_score' => $analysis['signalScore'], 'markers' => $analysis['markers']]
187+
);
119188
}
120189

121190
return null;
@@ -146,6 +215,34 @@ protected function checkGibberish(string $name): ?CategorizationResult
146215
return $this->matched(Category::OTHER_HASHED, 0.7, 'gibberish_random_digits');
147216
}
148217

218+
if ($this->isZeroVowelLongToken($coreName)) {
219+
return $this->matched(Category::OTHER_HASHED, 0.78, 'gibberish_zero_vowels');
220+
}
221+
222+
return null;
223+
}
224+
225+
/**
 * Flag names that carry no media markers and look like one long opaque token.
 *
 * Returns an OTHER_HASHED match (confidence 0.8, matched by
 * `gibberish_no_signal`) when the signal analysis reports a low-signal name
 * whose core is at least 20 characters long. Returns null otherwise,
 * including when the core name is a zero-vowel long token.
 */
protected function checkLowSignal(string $name): ?CategorizationResult
{
    $signals = $this->inspectSignals($name);

    $notApplicable = $this->isZeroVowelLongToken($signals['coreName'])
        || ! $signals['lowSignal']
        || $signals['coreLength'] < 20;

    if ($notApplicable) {
        return null;
    }

    $details = [
        'signal_score' => $signals['signalScore'],
        'markers' => $signals['markers'],
        'core_length' => $signals['coreLength'],
    ];

    return $this->matched(Category::OTHER_HASHED, 0.8, 'gibberish_no_signal', $details);
}
151248

app/Services/Categorization/Pipes/CategorizationPassable.php

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ class CategorizationPassable
2929
*/
3030
public array $allResults = [];
3131

32+
/**
33+
* Misc signal analysis captured by MiscPipe for downstream safety-net decisions.
34+
*
35+
* @var array<string, mixed>
36+
*/
37+
public array $miscAnalysis = [];
38+
3239
public function __construct(ReleaseContext $context, bool $debug = false)
3340
{
3441
$this->context = $context;
@@ -90,6 +97,7 @@ public function toArray(): array
9097
'locked_to_misc' => $this->lockedToMisc,
9198
'release_name' => $this->context->releaseName,
9299
'group_name' => $this->context->groupName,
100+
'misc_analysis' => $this->miscAnalysis,
93101
'all_results' => $this->allResults,
94102
'categorizer_details' => $this->bestResult->debug,
95103
];

0 commit comments

Comments
 (0)