Skip to content

Commit 113cbd5

Browse files
joshhvulcan and claude committed
Anchor net buoy-name patterns with \b to stop matching real vessels
The net patterns (net\d+, net\s+\w+, Net fish, NetFish, NET MARK) had no leading word boundary, so "net" mid-word matched real vessel names — SIGNET ARCTURUS, GANNET S, PLANET OCEAN, GARNET STAR, MADINET BENI-SAF, JEANET MAARTJE — flipping their classification to buoy.

Anchor each pattern with \b so "net" must start a word. Genuine buoy names (NET 1, NET10, NETFISH..., NET MARK) are unaffected since "net" starts a word there.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent ef9743f commit 113cbd5

2 files changed

Lines changed: 48 additions & 7 deletions

File tree

ais/src/atlantes/machine_annotation/data_annotate_utils.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -88,7 +88,11 @@ def get_ais_vessel_category(activity_descs: Optional[list[str]]) -> list[int]:
8888
# Common ways of naming a buoy on AIS generalized from anecdotal observations.
8989
#
9090
# These patterns are matched case-insensitively against entity names. They cover:
91-
# - Net identifiers: "net\d+" (NET10), "net\s+\w+" (NET 1, NET D)
91+
# - Net identifiers: "\bnet\d+" (NET10), "\bnet\s+\w+" (NET 1, NET D).
92+
# The leading "\b" requires "net" to start a word, so mid-word matches in real
93+
# vessel names (SIGNET, GANNET, PLANET, GARNET, MADINET, JEANET...) are NOT
94+
# treated as buoys. The same "\b" anchor is applied to the explicit gear names
95+
# below for the same reason.
9296
# - Chinese fishing gear suffixes: "yu\s*\d+-\d+" (MINPINGYU63036-1, ZHE DAI YU 04455-44).
9397
# Chinese fishing vessels use province+"YU"+registration (e.g. MINPINGYU63036).
9498
# A -N suffix indicates individual nets/gear, not the vessel itself.
@@ -110,7 +114,7 @@ def get_ais_vessel_category(activity_descs: Optional[list[str]]) -> list[int]:
110114
# Gear IDs frequently use double-dashes between numeric fields. Validated
111115
# against VHS: 1,135 names not already caught by other patterns, of which
112116
# 50.5% are GFW GEAR and only 3% FISHING.
113-
# - Explicit gear names: "fishing gear", "Net fish", "NetFish", "NET MARK"
117+
# - Explicit gear names: "fishing gear", "\bNet fish", "\bNetFish", "\bNET MARK"
114118
# - Numeric gear IDs: "\d{3,5} \d+ \d+" (e.g. "02333 287 82", "1638 48 90",
115119
# "78000 35 99"). Three-to-five-digit ID followed by space-separated numeric
116120
# fields (likely channel/signal and battery level without %). Observed in large
@@ -126,18 +130,18 @@ def get_ais_vessel_category(activity_descs: Optional[list[str]]) -> list[int]:
126130
# - Bracket voltage ("[8.0V]", "[7.1V]"): very rare in VHS data. Could revisit
127131
# if more examples surface.
128132
NAME_PATTERNS_FOR_BUOYS = [
129-
r"net\d+",
130-
r"net\s+\w+",
133+
r"\bnet\d+",
134+
r"\bnet\s+\w+",
131135
r"yu\s*\d+-\d+",
132136
r"fishing gear",
133137
r"\d+%",
134138
r"\d+V\d+",
135139
r"\d+\.\d+V",
136140
r"\d\s+\d+V\b",
137141
r"\d+--\d+",
138-
r"Net fish",
139-
r"NetFish",
140-
r"NET MARK",
142+
r"\bNet fish",
143+
r"\bNetFish",
144+
r"\bNET MARK",
141145
r"\d{3,5} \d+ \d+",
142146
]
143147

ais/tests/unit/inference/atlas_entity/test_unit_entity_postprocessor.py

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -531,6 +531,43 @@ def test_postprocess_net_space_identifier_classified_as_buoy(
531531
assert output.entity_class == "buoy"
532532
assert output.entity_classification_details.postprocess_rule_applied is True
533533

534+
@pytest.mark.parametrize(
535+
"entity_name",
536+
[
537+
"SIGNET ARCTURUS",
538+
"GANNET S",
539+
"PLANET OCEAN",
540+
"GARNET STAR",
541+
"MADINET BENI-SAF",
542+
"JEANET MAARTJE",
543+
],
544+
)
545+
def test_postprocess_midword_net_not_classified_as_buoy(
546+
self,
547+
entity_name: str,
548+
entity_postprocessor_class: AtlasEntityPostProcessor,
549+
) -> None:
550+
"""Real vessels with 'net' mid-word must not match the net buoy patterns."""
551+
input_data = EntityPostprocessorInput(
552+
predicted_class=AtlasEntityLabelsTrainingWithUnknown.VESSEL,
553+
entity_classification_details=EntityPostprocessorInputDetails(
554+
model="test", confidence=0.9, outputs=[0.9, 0.1]
555+
),
556+
metadata=EntityMetadata(
557+
binned_ship_type=0,
558+
ais_type=9999,
559+
mmsi="123456789",
560+
entity_name=entity_name,
561+
track_length=800,
562+
file_location=None,
563+
trackId="A:123456789",
564+
flag_code="USA",
565+
),
566+
)
567+
output = entity_postprocessor_class.postprocess(input_data)
568+
assert output.entity_class == "vessel"
569+
assert output.entity_classification_details.postprocess_rule_applied is False
570+
534571
@pytest.mark.parametrize(
535572
"entity_name",
536573
[

0 commit comments

Comments
 (0)