Skip to content

Commit bbc54fe

Browse files
committed
Update other categorization
1 parent bd7f540 commit bbc54fe

5 files changed

Lines changed: 113 additions & 2 deletions

File tree

app/Services/Categorization/Categorizers/MiscCategorizer.php

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,10 @@ public function inspectSignals(ReleaseContext|string $context): array
100100

101101
$signalScore = count($markers);
102102
$isCoreToken = preg_match('/^[A-Za-z0-9+\/_=-]+$/', $coreName) === 1;
103-
$lowSignal = $signalScore === 0 && $isCoreToken && strlen($coreName) >= 12;
103+
$lowSignal = $signalScore === 0
104+
&& $isCoreToken
105+
&& strlen($coreName) >= 12
106+
&& ! $this->hasStrongWordStructure($name, $coreName);
104107

105108
return [
106109
'coreName' => $coreName,
@@ -226,6 +229,10 @@ protected function checkLowSignal(string $name): ?CategorizationResult
226229
{
227230
$analysis = $this->inspectSignals($name);
228231

232+
if ($this->hasStrongWordStructure($name, $analysis['coreName'])) {
233+
return null;
234+
}
235+
229236
if ($this->isZeroVowelLongToken($analysis['coreName'])) {
230237
return null;
231238
}
@@ -246,6 +253,23 @@ protected function checkLowSignal(string $name): ?CategorizationResult
246253
return null;
247254
}
248255

256+
/**
 * Decide whether a name shows enough word-like structure to be treated as readable.
 *
 * All three signals must hold; they are evaluated in this order and the first
 * failure ends the check:
 *  - getMaxConsecutiveLetters($coreName) is at least 5,
 *  - hasNormalVowelRatio($coreName) passes with its default threshold,
 *  - countAlphabeticWordTokens($name) finds at least 2 tokens.
 *
 * @param  string  $name  Full release name, used for token counting.
 * @param  string  $coreName  Core portion of the name, used for letter-run and vowel checks.
 */
protected function hasStrongWordStructure(string $name, string $coreName): bool
{
    if ($this->getMaxConsecutiveLetters($coreName) < 5) {
        return false;
    }

    if (! $this->hasNormalVowelRatio($coreName)) {
        return false;
    }

    return $this->countAlphabeticWordTokens($name) >= 2;
}
262+
263+
/**
 * Count the tokens of a name that contain at least three consecutive ASCII letters.
 *
 * The name is first passed through stripExtensionsForAnalysis(), then split on
 * runs of dots, whitespace, underscores and hyphens.
 *
 * @param  string  $name  Release name to tokenize.
 * @return int Number of tokens matching /[a-z]{3,}/i.
 */
protected function countAlphabeticWordTokens(string $name): int
{
    $stripped = $this->stripExtensionsForAnalysis($name);
    $parts = preg_split('/[.\s_-]+/', $stripped);

    // preg_split() returns false on failure; nothing to count in that case.
    if (! $parts) {
        return 0;
    }

    $wordLike = 0;
    foreach ($parts as $part) {
        if (preg_match('/[a-z]{3,}/i', $part) === 1) {
            $wordLike++;
        }
    }

    return $wordLike;
}
272+
249273
protected function checkArchive(string $name): ?CategorizationResult
250274
{
251275
if (preg_match('/\.(zip|rar|7z|tar|gz|bz2|xz|tgz|tbz2|cab|iso|img|dmg|pkg|archive)$/i', $name)) {

app/Services/Categorization/Categorizers/PcCategorizer.php

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ protected function checkISO(string $name): ?CategorizationResult
148148

149149
protected function check0day(string $name): ?CategorizationResult
150150
{
151+
if (preg_match('/\.(msix|msixbundle|appx|appxbundle|msi)$/i', $name)) {
152+
return $this->matched(Category::PC_0DAY, 0.9, '0day_msix_installer');
153+
}
154+
151155
// Explicit 0day indicators
152156
if (preg_match('/[._ -]exe$|[._ -](utorrent|Virtualbox)[._ -]|\b0DAY\b|incl.+crack| DRM$|>DRM</i', $name)) {
153157
return $this->matched(Category::PC_0DAY, 0.9, '0day_explicit');

app/Traits/DetectsHashedNames.php

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,23 @@ protected function isZeroVowelLongToken(string $coreName, int $minLength = 20):
264264
return preg_match('/[aeiou]/i', $letterOnly) !== 1;
265265
}
266266

267+
/**
 * Check whether the letter-only portion of a token has a normal vowel ratio.
 *
 * Everything except ASCII letters (case-insensitive) is discarded, then the share
 * of a/e/i/o/u among the remaining letters is compared against $min.
 *
 * @param  string  $str  Token to inspect.
 * @param  float  $min  Minimum vowel share (inclusive) for the ratio to count as normal.
 * @param  int  $minLetters  Minimum number of letters required before the ratio is evaluated.
 * @return bool False when there are too few letters, otherwise whether the vowel share is >= $min.
 */
protected function hasNormalVowelRatio(string $str, float $min = 0.2, int $minLetters = 5): bool
{
    // preg_replace() yields null on a PCRE failure; treat that as "no letters"
    // instead of passing null on to strlen().
    $letters = preg_replace('/[^a-z]/i', '', $str) ?? '';
    $letterCount = strlen($letters);

    // The explicit zero check also protects the division below when $minLetters <= 0.
    if ($letterCount === 0 || $letterCount < $minLetters) {
        return false;
    }

    // preg_match_all() returns the number of full matches (false on failure).
    $vowelCount = (int) preg_match_all('/[aeiou]/i', $letters);

    return ($vowelCount / $letterCount) >= $min;
}
283+
267284
/**
268285
* Check if a string looks like a random/obfuscated string rather than a real title.
269286
*
@@ -315,7 +332,7 @@ protected function looksLikeRandomString(string $str): bool
315332
protected function stripExtensionsForAnalysis(string $name): string
316333
{
317334
return preg_replace(
318-
'/\.(mkv|avi|mp4|m4v|mpg|mpeg|wmv|flv|mov|ts|vob|iso|divx|par2?|nfo|sfv|nzb|rar|r\d{2,3}|zip|7z|gz|tar|001)$/i',
335+
'/\.(mkv|avi|mp4|m4v|mpg|mpeg|wmv|flv|mov|ts|vob|iso|divx|par2?|nfo|sfv|nzb|rar|r\d{2,3}|zip|7z|gz|tar|001|msix|msixbundle|appx|appxbundle|apk|xap|ipa|deb|rpm|pkg|dmg|exe|msi)$/i',
319336
'',
320337
trim($name)
321338
);

tests/Unit/CategorizePcGameTest.php

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,23 @@
77
use App\Models\Category;
88
use App\Services\Categorization\Categorizers\PcCategorizer;
99
use App\Services\Categorization\ReleaseContext;
10+
use PHPUnit\Framework\Attributes\DataProvider;
1011
use PHPUnit\Framework\TestCase;
1112

1213
class CategorizePcGameTest extends TestCase
1314
{
15+
/**
 * Installer package file names, one release name per named data set.
 *
 * @return array<string, array{0: string}>
 */
public static function installerPackageProvider(): array
{
    $namesByLabel = [
        'msix bundle' => 'Adobe Express Photos.Msixbundle',
        'appx bundle' => 'Microsoft.To.Do.appxbundle',
        'msi package' => 'Utility Installer.msi',
    ];

    // array_map() keeps string keys when it is given a single array.
    return array_map(
        static fn (string $releaseName): array => [$releaseName],
        $namesByLabel
    );
}
26+
1427
private PcCategorizer $categorizer;
1528

1629
protected function setUp(): void
@@ -96,4 +109,21 @@ public function test_console_and_mac_not_misclassified_as_pc_game(): void
96109
}
97110
}
98111
}
112+
113+
/**
 * Installer package names must be categorized as PC 0day by the installer rule.
 */
#[DataProvider('installerPackageProvider')]
public function test_installer_packages_are_classified_as_pc_0day(string $name): void
{
    $outcome = $this->categorizer->categorize(
        new ReleaseContext(
            releaseName: $name,
            groupId: 0,
            groupName: '',
            poster: ''
        )
    );

    self::assertTrue($outcome->isSuccessful(), "Expected PC 0day match for installer package: $name");
    self::assertSame(Category::PC_0DAY, $outcome->categoryId, "Expected PC_0DAY for installer package: $name");
    self::assertSame('0day_msix_installer', $outcome->matchedBy);
}
99129
}

tests/Unit/HashedReleaseCategorizationTest.php

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,21 @@ public static function legitimateNamesProvider(): array
132132
'TV episode' => ['Show.Name.S03E05.720p.HDTV.x264-GROUP', 'alt.binaries.hdtv'],
133133
'Music album' => ['Artist.Name-Album.Title-2024-FLAC-GROUP', 'alt.binaries.sounds.mp3'],
134134
'Game release' => ['Starfield-RUNE', 'alt.binaries.games'],
135+
'Readable software package' => ['Microsoft Office Suite Installer', 'alt.binaries.warez'],
136+
'Adobe msix bundle' => ['Adobe Express Photos.Msixbundle', 'alt.binaries.erotica.divx'],
137+
];
138+
}
139+
140+
/**
 * Readable, software-like release names that the misc categorizer must not flag as hashed.
 *
 * @return array<string, array{0: string}>
 */
public static function readableSoftwareNamesProvider(): array
{
    $namesByLabel = [
        'Adobe msix bundle' => 'Adobe Express Photos.Msixbundle',
        'Office installer words' => 'Microsoft Office Suite Installer',
    ];

    // array_map() keeps string keys when it is given a single array.
    return array_map(
        static fn (string $releaseName): array => [$releaseName],
        $namesByLabel
    );
}
137152

@@ -202,6 +217,18 @@ public function test_misc_categorizer_detects_gibberish_names(string $name, stri
202217
$this->assertSame(Category::OTHER_HASHED, $result->categoryId, "Expected OTHER_HASHED for: $name");
203218
}
204219

220+
/**
 * Readable software names must fall through the misc categorizer without a match.
 */
#[DataProvider('readableSoftwareNamesProvider')]
public function test_misc_categorizer_does_not_hash_readable_software_names(string $name): void
{
    $misc = new MiscCategorizer();
    $outcome = $misc->categorize(new ReleaseContext(releaseName: $name, groupId: 0));

    self::assertFalse($outcome->isSuccessful(), "Readable software name '$name' should not be matched by misc hash heuristics");
    self::assertSame(Category::OTHER_MISC, $outcome->categoryId);
    self::assertSame('no_match', $outcome->matchedBy);
}
231+
205232
// ------------------------------------------------------------------
206233
// Tests: MiscPipe lock mechanism
207234
// ------------------------------------------------------------------
@@ -289,6 +316,15 @@ public function test_legitimate_releases_are_not_locked(string $name, string $gr
289316
);
290317
}
291318

319+
/**
 * An .Msixbundle release must leave the pipeline unlocked and matched as PC 0day.
 */
public function test_adobe_msix_bundle_reaches_pc_0day_in_full_pipeline(): void
{
    $outcome = $this->runPipeline('Adobe Express Photos.Msixbundle', 'alt.binaries.erotica.divx');

    self::assertFalse($outcome->lockedToMisc);

    $best = $outcome->bestResult;
    self::assertSame(Category::PC_0DAY, $best->categoryId);
    self::assertSame('0day_msix_installer', $best->matchedBy);
}
327+
292328
// ------------------------------------------------------------------
293329
// Tests: shouldStopProcessing() respects the lock
294330
// ------------------------------------------------------------------

0 commit comments

Comments
 (0)