Skip to content

Commit 93b3bfa

Browse files
authored
Merge pull request #62 from piatrashkakanstantinass/filtration
Filtration support
2 parents 4a799df + 95566f0 commit 93b3bfa

13 files changed

+406
-35
lines changed

README.md

+2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ Or if you come across some piece of text and you don't know what it is, `What` w
5353

5454
**File Opening** You can pass in a file path by `what "this/is/a/file/path"`. What is smart enough to figure out it's a file!
5555

56+
**Filtration** You can filter output by using `what --rarity 0.2:0.8 --include_tags tag1,tag2 TEXT`. Use `what --help` to get more information.
57+
5658
# 🍕 API
5759

5860
PyWhat has an API! Click here [https://github.com/bee-san/pyWhat/wiki/API](https://github.com/bee-san/pyWhat/wiki/API) to read about it.

pywhat/Data/regex.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,7 @@
424424
"Regex": "^[0-9]{3}-[0-9]{2}-[0-9]{4}$",
425425
"plural_name": false,
426426
"Description": "An [#CAE4F1][link=https://en.wikipedia.org/wiki/Social_Security_number]American Identification Number[/link][/#CAE4F1]",
427-
"rarity": 0.2,
427+
"Rarity": 0.2,
428428
"Tags": [
429429
"Credentials",
430430
"Password",

pywhat/__init__.py

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from pywhat.distribution import Distribution
2+
from pywhat.helper import AvailableTags
3+
from pywhat.identifier import Identifier
4+
5+
# Tags collected from the regex database, computed once when the package is
# imported and exposed as part of the public API (see __all__).
pywhat_tags = AvailableTags().get_tags()
6+
7+
8+
__all__ = ["Identifier", "Distribution", "pywhat_tags"]

pywhat/distribution.py

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
from typing import Optional
2+
3+
from pywhat.helper import AvailableTags, CaseInsensitiveSet, InvalidTag, load_regexes
4+
5+
6+
class Distribution:
    """
    A view of the regex database narrowed down by a filter.

    The filter is a dictionary with any of the following optional keys:

    * "Tags": only keep entries carrying at least one of these tags
      (defaults to every available tag).
    * "ExcludeTags": drop entries carrying any of these tags (defaults to none).
    * "MinRarity" / "MaxRarity": inclusive rarity bounds (default 0 and 1).

    Tag names are compared case-insensitively.

    Example filters:
    * {"Tags": ["Networking"]}
    * {"Tags": ["Identifiers"], "ExcludeTags": ["Credentials"], "MinRarity": 0.6}
    """

    def __init__(self, filters_dict: Optional[dict] = None):
        """
        Build the distribution and apply the filter immediately.

        Raises InvalidTag when "Tags" or "ExcludeTags" contains a tag that
        does not appear in the regex database.
        """
        tags = CaseInsensitiveSet(AvailableTags().get_tags())
        if filters_dict is None:
            filters_dict = {}

        # Read with .get() rather than .setdefault() so that the dictionary
        # passed in by the caller is never modified.
        self._dict = {
            "Tags": CaseInsensitiveSet(filters_dict.get("Tags", tags)),
            "ExcludeTags": CaseInsensitiveSet(filters_dict.get("ExcludeTags", ())),
            "MinRarity": filters_dict.get("MinRarity", 0),
            "MaxRarity": filters_dict.get("MaxRarity", 1),
        }
        if not self._dict["Tags"].issubset(tags) or not self._dict["ExcludeTags"].issubset(tags):
            raise InvalidTag("Passed filter contains tags that are not used by 'what'")

        self._regexes = load_regexes()
        self._filter()

    def _filter(self):
        """Keep only the regex entries that satisfy the stored filter."""
        min_rarity = self._dict["MinRarity"]
        max_rarity = self._dict["MaxRarity"]
        wanted = self._dict["Tags"]
        unwanted = self._dict["ExcludeTags"]
        self._regexes = [
            regex
            for regex in self._regexes
            if min_rarity <= regex["Rarity"] <= max_rarity
            and set(regex["Tags"]) & wanted
            and not set(regex["Tags"]) & unwanted
        ]

    def get_regexes(self):
        """Return a new list holding the regex entries that passed the filter."""
        return list(self._regexes)

    def get_filter(self):
        """Return a shallow copy of the normalised filter dictionary."""
        return dict(self._dict)

    def __repr__(self):
        return f"Distribution({self._dict})"

    def __and__(self, other):
        """
        Combine two distributions: tag sets and exclude sets are intersected
        and the rarity range is narrowed to the overlap of both ranges.
        """
        if not isinstance(other, Distribution):
            return NotImplemented
        tags = self._dict["Tags"] & other._dict["Tags"]
        exclude_tags = self._dict["ExcludeTags"] & other._dict["ExcludeTags"]
        min_rarity = max(self._dict["MinRarity"], other._dict["MinRarity"])
        max_rarity = min(self._dict["MaxRarity"], other._dict["MaxRarity"])
        return Distribution(
            {"Tags": tags, "ExcludeTags": exclude_tags,
             "MinRarity": min_rarity, "MaxRarity": max_rarity})

    def __or__(self, other):
        """
        Combine two distributions: tag sets and exclude sets are united and
        the rarity range is widened to span both ranges.
        """
        if not isinstance(other, Distribution):
            return NotImplemented
        tags = self._dict["Tags"] | other._dict["Tags"]
        exclude_tags = self._dict["ExcludeTags"] | other._dict["ExcludeTags"]
        min_rarity = min(self._dict["MinRarity"], other._dict["MinRarity"])
        max_rarity = max(self._dict["MaxRarity"], other._dict["MaxRarity"])
        return Distribution(
            {"Tags": tags, "ExcludeTags": exclude_tags,
             "MinRarity": min_rarity, "MaxRarity": max_rarity})

    def __iand__(self, other):
        """`a &= b` rebinds `a` to the new distribution produced by `a & b`."""
        if not isinstance(other, Distribution):
            return NotImplemented
        return self & other

    def __ior__(self, other):
        """`a |= b` rebinds `a` to the new distribution produced by `a | b`."""
        if not isinstance(other, Distribution):
            return NotImplemented
        return self | other

pywhat/helper.py

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
"""Helper utilities"""
2+
import collections.abc
3+
import json
4+
import os.path
5+
6+
7+
class AvailableTags:
    """Collects every tag that appears in the regex database."""

    def __init__(self):
        # Flatten the "Tags" list of each database entry into one
        # de-duplicated set.
        self.tags = {tag for entry in load_regexes() for tag in entry["Tags"]}

    def get_tags(self):
        """Return the set of tags gathered when the object was created."""
        return self.tags
16+
17+
18+
class InvalidTag(Exception):
    """Raised when a Distribution() filter names a tag that does not exist."""
25+
26+
27+
def load_regexes() -> list:
    """Load and return the parsed contents of the bundled Data/regex.json.

    The file is looked up relative to the directory containing this module.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    database_path = os.path.join(module_dir, "Data/regex.json")
    with open(database_path, "r", encoding="utf-8") as handle:
        return json.load(handle)
32+
33+
34+
class CaseInsensitiveSet(collections.abc.Set):
    """
    An immutable set whose string elements are compared case-insensitively.

    Strings are stored lower-cased, so iterating the set yields the
    lower-cased form; non-string elements are stored unchanged.
    The operators inherited from collections.abc.Set (&, |, <=, ...) build
    their results through this class and therefore stay case-insensitive.
    """

    def __init__(self, iterable=None):
        self._elements = set()
        if iterable is not None:
            self._elements = set(map(self._lower, iterable))

    def _lower(self, value):
        """Return the normalised form of an element (lower-cased if a string)."""
        return value.lower() if isinstance(value, str) else value

    def __contains__(self, value):
        return self._lower(value) in self._elements

    def __iter__(self):
        return iter(self._elements)

    def __len__(self):
        return len(self._elements)

    def __repr__(self):
        return self._elements.__repr__()

    def issubset(self, other):
        """
        Return True when every element of this set is in `other`,
        ignoring the case of strings on both sides.
        """
        if not isinstance(other, CaseInsensitiveSet):
            # Our elements are already lower-cased; normalise the other side
            # too, otherwise a plain set holding "Networking" would never
            # match our "networking".
            try:
                other = CaseInsensitiveSet(other)
            except TypeError:
                # Non-iterable container: fall back to its own membership test.
                pass
        return all(value in other for value in self)

pywhat/identifier.py

+24-13
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,47 @@
11
import os.path
2+
from typing import List, Optional
23

4+
from pywhat.distribution import Distribution
35
from pywhat.magic_numbers import FileSignatures
46
from pywhat.nameThatHash import Nth
57
from pywhat.regex_identifier import RegexIdentifier
68

79

810
class Identifier:
9-
def __init__(self):
10-
self.regex_id = RegexIdentifier()
11-
self.file_sig = FileSignatures()
12-
self.name_that_hash = Nth()
13-
14-
def identify(self, text: str, api=False) -> dict:
11+
def __init__(self, distribution: Optional[Distribution] = None):
12+
if distribution is None:
13+
self.distribution = Distribution()
14+
else:
15+
self.distribution = distribution
16+
self._regex_id = RegexIdentifier()
17+
self._file_sig = FileSignatures()
18+
self._name_that_hash = Nth()
19+
20+
def identify(self, text: str, dist: Distribution = None,
21+
api=False) -> dict:
22+
if dist is None:
23+
dist = self.distribution
1524
identify_obj = {}
1625

1726
magic_numbers = None
18-
if not api and self.file_exists(text):
19-
magic_numbers = self.file_sig.open_binary_scan_magic_nums(text)
20-
text = self.file_sig.open_file_loc(text)
27+
if not api and self._file_exists(text):
28+
magic_numbers = self._file_sig.open_binary_scan_magic_nums(text)
29+
text = self._file_sig.open_file_loc(text)
2130
identify_obj["File Signatures"] = magic_numbers
2231
else:
2332
text = [text]
2433

2534
if not magic_numbers:
2635
# If file doesn't exist, check to see if the inputted text is
2736
# a file in hex format
28-
identify_obj["File Signatures"] = self.file_sig.check_magic_nums(text)
29-
identify_obj["Regexes"] = self.regex_id.check(text)
37+
identify_obj["File Signatures"] = self._file_sig.check_magic_nums(text)
38+
39+
identify_obj["Regexes"] = self._regex_id.check(text, dist)
40+
3041
# get_hashes takes a list of hashes, we split to give it a list
31-
# identify_obj["Hashes"] = self.name_that_hash.get_hashes(text.split())
42+
# identify_obj["Hashes"] = self._name_that_hash.get_hashes(text.split())
3243

3344
return identify_obj
3445

35-
def file_exists(self, text):
46+
def _file_exists(self, text):
3647
return os.path.isfile(text)

pywhat/printer.py

+1-3
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,11 @@ def pretty_print(self, text: dict):
1010

1111
to_out = ""
1212

13-
if text["File Signatures"]:
13+
if text["File Signatures"] and text["Regexes"]:
1414
to_out += "\n"
1515
to_out += f"[bold #D7Afff]File Identified[/bold #D7Afff] with Magic Numbers {text['File Signatures']['ISO 8859-1']}."
1616
to_out += f"\n[bold #D7Afff]File Description:[/bold #D7Afff] {text['File Signatures']['Description']}."
1717
to_out += "\n"
18-
if to_out:
19-
console.print(to_out)
2018

2119
if text["Regexes"]:
2220
to_out += "\n[bold #D7Afff]Possible Identification[/bold #D7Afff]"

pywhat/regex_identifier.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,21 @@
22
import json
33
import os
44
import re
5+
from typing import Optional
6+
7+
from pywhat.distribution import Distribution
58

69

710
class RegexIdentifier:
811
def __init__(self):
9-
path = "Data/regex.json"
10-
fullpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), path)
11-
with open(fullpath, "r", encoding="utf8") as myfile:
12-
self.regexes = json.load(myfile)
12+
self.distribution = Distribution()
1313

14-
def check(self, text):
14+
def check(self, text, distribution: Optional[Distribution] = None):
15+
if distribution is None:
16+
distribution = self.distribution
1517
matches = []
1618
for txt in text:
17-
for reg in self.regexes:
19+
for reg in distribution.get_regexes():
1820
matched_regex = re.search(reg["Regex"], txt, re.UNICODE)
1921

2022
if matched_regex:
@@ -28,8 +30,8 @@ def check(self, text):
2830
codes_path = "Data/phone_codes.json"
2931
codes_fullpath = os.path.join(
3032
os.path.dirname(os.path.abspath(__file__)), codes_path)
31-
with open(codes_fullpath) as f:
32-
codes = json.load(f)
33+
with open(codes_fullpath, "r", encoding="utf-8") as myfile:
34+
codes = json.load(myfile)
3335

3436
locations = []
3537
for code in codes:

pywhat/what.py

+66-5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,49 @@
1+
import sys
2+
13
import click
4+
from rich.console import Console
5+
26
from pywhat import identifier, printer
7+
from pywhat.distribution import Distribution
8+
from pywhat.helper import AvailableTags, InvalidTag
9+
10+
11+
def print_tags(ctx, opts, value):
    """Click callback for --tags: list every available tag, then exit.

    Does nothing when the flag was not given (value is falsy).
    """
    if not value:
        return

    listing = "\n".join(sorted(AvailableTags().get_tags()))
    Console().print("[bold #D7Afff]" + listing + "[/bold #D7Afff]")
    sys.exit()
17+
18+
19+
def parse_options(rarity, include_tags, exclude_tags):
    """Turn the command-line filter options into a Distribution.

    rarity is a "min:max" string in which either bound may be left empty;
    include_tags and exclude_tags are comma separated tag lists. Any of the
    three may be None. On malformed input a message is printed and the
    process exits with status 1.
    """
    options = {}

    if rarity is not None:
        bounds = rarity.split(":")
        if len(bounds) != 2:
            print("Invalid rarity range format ('min:max' expected)")
            sys.exit(1)
        try:
            # An empty or whitespace-only bound keeps that side at its default.
            for key, raw in zip(("MinRarity", "MaxRarity"), bounds):
                if raw.strip():
                    options[key] = float(raw)
        except ValueError:
            print("Invalid rarity argument (float expected)")
            sys.exit(1)

    if include_tags is not None:
        options["Tags"] = [tag.strip() for tag in include_tags.split(',')]
    if exclude_tags is not None:
        options["ExcludeTags"] = [tag.strip() for tag in exclude_tags.split(',')]

    try:
        return Distribution(options)
    except InvalidTag:
        print("Passed tags are not valid.\n"
              "You can check available tags by using: 'pywhat --tags'")
        sys.exit(1)
347

448

549
@click.command(
@@ -8,14 +52,26 @@
852
)
953
)
1054
@click.argument("text_input", required=True)
11-
def main(text_input):
55+
@click.option("-t", "--tags", is_flag=True, expose_value=False, callback=print_tags, help="Show available tags and exit.")
56+
@click.option("-r", "--rarity", help="Filter by rarity. This is in the range of 0:1. To filter only items past 0.5, use 0.5: with the colon on the end.")
57+
@click.option("-i", "--include_tags", help="Only print entries with included tags.")
58+
@click.option("-e", "--exclude_tags", help="Exclude tags.")
59+
def main(text_input, rarity, include_tags, exclude_tags):
1260
"""
1361
What - Identify what something is.\n
1462
1563
Made by Bee https://twitter.com/bee_sec_san\n
1664
1765
https://github.com/bee-san\n
1866
67+
Filtration:\n
68+
--rarity min:max\n
69+
Only print entries with rarity in range [min,max]. min and max can be omitted.\n
70+
--include_tags list\n
71+
Only include entries containing at least one tag in a list. List is a comma separated list.\n
72+
--exclude_tags list\n
73+
Exclude specified tags. List is a comma separated list.\n
74+
1975
Examples:
2076
2177
* what "HTB{this is a flag}"
@@ -24,22 +80,27 @@ def main(text_input):
2480
2581
* what -- 52.6169586, -1.9779857
2682
83+
* what --rarity 0.6: myEmail@host.org
84+
2785
Your text must either be in quotation marks, or use the POSIX standard of "--" to mean "anything after -- is textual input".
2886
2987
"""
3088

31-
what_obj = What_Object()
89+
what_obj = What_Object(
90+
parse_options(rarity, include_tags, exclude_tags)
91+
)
3292
identified_output = what_obj.what_is_this(text_input)
3393

3494
p = printer.Printing()
3595
p.pretty_print(identified_output)
3696

3797

3898
class What_Object:
39-
def __init__(self):
40-
self.id = identifier.Identifier()
99+
def __init__(self, distribution):
100+
self.id = identifier.Identifier(distribution)
41101

42-
def what_is_this(self, text: str) -> dict:
102+
def what_is_this(
103+
self, text: str) -> dict:
43104
"""
44105
Returns a Python dictionary of everything that has been identified
45106
"""

0 commit comments

Comments
 (0)