address-batch-geocoder/geocoder.py at main · CityOfPhiladelphia/address-batch-geocoder · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
import os

# Set thread pool size to 1 to avoid API rate limits
# This needs to be set before polars is imported
# os.environ["POLARS_MAX_THREADS"] = "1"

import yaml
import polars as pl
import requests
import click
import os
import tempfile
from datetime import datetime
from functools import partial
from utils.encoder import detect_file_encoding, recode_to_utf8
from utils.parse_address import (
    find_address_fields,
    parse_address,
    infer_city_state_field,
    is_non_philly_from_full_address,
    is_non_philly_from_split_address,
)
from utils.ais_lookup import ais_lookup
from utils.tomtom_lookup import tomtom_lookup
from utils.zips import ZIPS
from mapping.ais_properties_fields import POSSIBLE_FIELDS
from passyunk.parser import PassyunkParser
from pathlib import PurePath


def get_current_time():
    current_datetime = datetime.now()
    return current_datetime.strftime("%H:%M:%S")


def split_non_philly_address(config, lf: pl.LazyFrame) -> pl.LazyFrame:
    """
    Given a polars LazyFrame, splits into two lazy frames:
    One for addresses located in Philadelphia, one for addresses
    not located in Philadelphia.

    Returns:
        (philly_lf, non_philly_lf)
    """

    fields = infer_city_state_field(config)

    # If we are using full address field, we need to look up
    # against us-address.
    full_address_field = fields.get("full_address")

    location_struct = pl.Struct(
        [pl.Field("is_non_philly", pl.Boolean), pl.Field("is_undefined", pl.Boolean)]
    )

    if full_address_field:
        non_philly_fn = partial(is_non_philly_from_full_address, philly_zips=ZIPS)

        flagged = lf.with_columns(
            pl.col(full_address_field)
            .map_elements(non_philly_fn, return_dtype=location_struct)
            .alias("location_info")
        ).unnest("location_info")

    # Otherwise, get address columns from config
    else:
        city_col = fields.get("city")
        state_col = fields.get("state")
        zip_col = fields.get("zip")

        # Make an address struct based on which fields exist
        address_struct = pl.struct(
            [
                (pl.col(city_col) if city_col else pl.lit(None, dtype=pl.Utf8)).alias(
                    "city"
                ),
                (pl.col(state_col) if state_col else pl.lit(None, dtype=pl.Utf8)).alias(
                    "state"
                ),
                (pl.col(zip_col) if zip_col else pl.lit(None, dtype=pl.Utf8)).alias(
                    "zip"
                ),
            ]
        )

        # Partial helper function for searching for non philly records
        # used for mapping with polars
        non_philly_fn = partial(is_non_philly_from_split_address, zips=ZIPS)

        flagged = lf.with_columns(
            address_struct.map_elements(
                non_philly_fn, return_dtype=location_struct
            ).alias("location_info")
        ).unnest("location_info")

    non_philly_lf = flagged.filter(pl.col("is_non_philly"))
    philly_lf = flagged.filter(~pl.col("is_non_philly"))

    return philly_lf, non_philly_lf


def parse_with_passyunk_parser(
    parser, address_col: str, lf: pl.LazyFrame
) -> pl.LazyFrame:
    """
    Given a polars LazyFrame, parses addresses in that LazyFrame
    using passyunk parser, and adds output address.

    Args:
        parser: A passyunk parser instance
        address_col: The address column to parse
        lf: The polars lazyframe with an address field to parse

    Returns:
        A polars lazyframe with output address, and address validity booleans
        added.
    """

    # Create struct of columns to be filled by parse address function
    new_cols = pl.Struct(
        [
            pl.Field("output_address", pl.String),
            pl.Field("is_addr", pl.Boolean),
            pl.Field("is_philly_addr", pl.Boolean),
            pl.Field("is_multiple_match", pl.Boolean),
            pl.Field("geocoder_used", pl.String),
        ]
    )

    lf = lf.with_columns(
        pl.col(address_col)
        .map_elements(lambda s: parse_address(parser, s), return_dtype=new_cols)
        .alias("passyunk_struct")
    ).unnest("passyunk_struct")

    return lf


def build_enrichment_fields(config: dict) -> tuple[list, list]:
    """
    Given a config dictionary, returns two lists of fields to be
    added to the input file. One list is the address file fieldnames,
    the other is the AIS fieldnames.

    Args:
        config (dict): A dictionary read from the config yaml file

    Returns: A tuple with AIS fieldnames and address file fieldnames.
    """
    ais_enrichment_fields = config["enrichment_fields"]

    invalid_fields = [
        item for item in ais_enrichment_fields if item not in POSSIBLE_FIELDS.keys()
    ]

    if invalid_fields:
        to_print = ", ".join(field for field in invalid_fields)
        raise ValueError(
            "The following fields are not available:"
            f"{to_print}. Please correct these and try again."
        )

    address_file_fields = []

    [
        address_file_fields.append(POSSIBLE_FIELDS[item])
        for item in ais_enrichment_fields
    ]

    # Need street_address for joining
    address_file_fields.append("street_address")

    # Add coordinate fields based on config
    srid_4326 = config.get("srid_4326")
    srid_2272 = config.get("srid_2272")

    if srid_4326:
        address_file_fields.extend(["geocode_lat", "geocode_lon"])
    if srid_2272:
        address_file_fields.extend(["geocode_x", "geocode_y"])

    # Avoid issues if user specifies a field more than once
    return (set(ais_enrichment_fields), set(address_file_fields))


def add_address_file_fields(
    geo_filepath: str, input_data: pl.LazyFrame, address_fields: list, config: dict
) -> pl.LazyFrame:
    """
    Given a list of address fields to add, adds those fields from
    the address file to each record in the input data. Does so via a
    left join on the full address.

    Args:
        geo_filepath: The filepath to the geography file. This is the main
        file used to geocode addresses.
        input_data: A lazyframe containing the input data to be enriched
        address_fields: A list of one or more address fields
    """
    addresses = pl.scan_parquet(geo_filepath)
    addresses = addresses.select(address_fields)

    # Check which enrichment fields would conflict with existing columns
    existing_cols = input_data.collect_schema().names()
    enrichment_col_names = [
        key for key, value in POSSIBLE_FIELDS.items() if value in address_fields
    ]
    conflicts = [field for field in enrichment_col_names if field in existing_cols]

    # Rename conflicting input columns to _left
    if conflicts:
        rename_input = {field: field + "_left" for field in conflicts}
        input_data = input_data.rename(rename_input)

    rename_mapping = {
        value: key for key, value in POSSIBLE_FIELDS.items() if value in address_fields
    }

    joined_lf = input_data.join(
        addresses, how="left", left_on="output_address", right_on="street_address"
    ).rename(rename_mapping)

    # Mark match type as address_file if we got coordinates from the file
    # Check whichever SRID is enabled
    srid_4326 = config.get("srid_4326")
    srid_2272 = config.get("srid_2272")

    if srid_4326:
        match_condition = pl.col("geocode_lat").is_not_null()
    elif srid_2272:
        match_condition = pl.col("geocode_x").is_not_null()
    else:
        # This shouldn't happen due to earlier validation, but just in case
        raise ValueError("At least one SRID must be enabled")

    joined_lf = joined_lf.with_columns(
        pl.when(match_condition)
        .then(pl.lit("address_file"))
        .otherwise("geocoder_used")
        .alias("geocoder_used")
    )

    return joined_lf


def split_geos(data: pl.LazyFrame, config: dict):
    """
    Splits a lazyframe into two lazy frames: one for records with latitude
    and longitude, and another for records without latitude and longitude.
    Used to determine which records need to be added using AIS.
    """

    srid_4326 = config.get("srid_4326")
    srid_2272 = config.get("srid_2272")

    if srid_4326:
        has_geo = data.filter(
            (~pl.col("geocode_lat").is_null()) & (~pl.col("geocode_lon").is_null())
        )
        needs_geo = data.filter(
            (pl.col("geocode_lat").is_null()) | (pl.col("geocode_lon").is_null())
        )

    elif srid_2272:
        has_geo = data.filter(
            (~pl.col("geocode_x").is_null()) & (~pl.col("geocode_y").is_null())
        )
        needs_geo = data.filter(
            (pl.col("geocode_x").is_null()) | (pl.col("geocode_y").is_null())
        )

    else:
        raise ValueError("Either SRID 4326 or SRID 2272 must be specified.")

    return (has_geo, needs_geo)


def enrich_with_ais(
    config: dict,
    to_add: pl.LazyFrame,
    full_address_field: bool,
    enrichment_fields: list,
) -> pl.LazyFrame:
    """
    Enrich a lazyframe with user-specified columns from AIS.

    Args:
        config (dict): A dictionary of config information. Used
        to make API calls.
        to_add (polars LazyFrame): A lazyframe of data to enrich
        full_address_field (bool): Whether or not the user has specified
        that the input data has a full address field
        enrichment_fields: A list of fields to add to the lazyframe.
    """

    # Created augmented address for undefined locations
    to_add = to_add.with_columns(
        pl.when(pl.col("is_undefined"))
        .then(pl.concat_str([pl.col("output_address"), pl.lit(", Philadelphia, PA")]))
        .otherwise(pl.col("output_address"))
        .alias("api_address")
    )

    # Build struct based on config
    srid_4326 = config.get("srid_4326")
    srid_2272 = config.get("srid_2272")

    struct_fields = [
    pl.Field("output_address", pl.String),
    pl.Field("is_addr", pl.Boolean),
    pl.Field("is_philly_addr", pl.Boolean),
    pl.Field("is_multiple_match", pl.Boolean),
    pl.Field("geocoder_used", pl.String),
]

    if srid_4326:
        struct_fields.extend([
            pl.Field("geocode_lat", pl.String),
            pl.Field("geocode_lon", pl.String)
        ])

    if srid_2272:
        struct_fields.extend([
            pl.Field("geocode_x", pl.String),
            pl.Field("geocode_y", pl.String),
        ])

    struct_fields.extend([
        *[pl.Field(field, pl.String) for field in enrichment_fields]
    ])

    new_cols = pl.Struct(struct_fields)

    API_KEY = config.get("AIS_API_KEY")
    field_names = [f.name for f in new_cols.fields]

    with requests.Session() as sess:
        addr_cfg = config.get("address_fields") or {}
        zip_field = addr_cfg.get("zip")

        # Don't include zip field if full address field is specified
        # Use API address to account for cases where we must
        # assume that address is in Philadelphia
        if zip_field and not full_address_field:
            struct_expr = pl.struct(
                [
                    "api_address",
                    "output_address",
                    zip_field,
                    "is_addr",
                    "is_philly_addr",
                ]
            ).map_elements(
                lambda s: ais_lookup(
                    sess,
                    API_KEY,
                    s["api_address"],
                    s[zip_field],
                    enrichment_fields,
                    s["is_addr"],
                    s["is_philly_addr"],
                    s["output_address"],
                    srid_4326,
                    srid_2272
                ),
                return_dtype=new_cols,
            )
        else:
            struct_expr = pl.struct(
                ["api_address", "output_address", "is_addr", "is_philly_addr"]
            ).map_elements(
                lambda s: ais_lookup(
                    sess,
                    API_KEY,
                    s["api_address"],
                    None,
                    enrichment_fields,
                    s["is_addr"],
                    s["is_philly_addr"],
                    s["output_address"],
                    srid_4326,
                    srid_2272
                ),
                return_dtype=new_cols,
            )

        tmp_name = "ais_struct"

        added = (
            to_add.with_columns(struct_expr.alias(tmp_name))
            .with_columns(
                *[pl.col(tmp_name).struct.field(n).alias(n) for n in field_names]
            )
            .drop(tmp_name, "api_address")  # Drop the temporary api_address column
        )

    return added


def enrich_with_tomtom(parser, config: dict, to_add: pl.LazyFrame) -> pl.LazyFrame:
    """
    Enrich a lazy frame with latitude and longitude from TomTom.

    Args:
        parser: A passyunk parser object. Used to standardize TomTom output.
        config: A dictionary containing config information
        to_add: A polars lazyframe to be enriched

    Returns:
        An enriched polars lazyframe.
    """

    # Create augmented address for undefined locations

    to_add = to_add.with_columns(
        pl.when(pl.col("is_undefined"))
        .then(pl.concat_str([pl.col("raw_address"), pl.lit(", Philadelphia, PA")]))
        .otherwise(pl.col("raw_address"))
        .alias("raw_api_address")
    )

    srid_4326 = config.get("srid_4326")
    srid_2272 = config.get("srid_2272")

    struct_fields = [
        pl.Field("output_address", pl.String),
        pl.Field("geocoder_used", pl.String),
        pl.Field("is_addr", pl.Boolean),
        pl.Field("is_philly_addr", pl.Boolean),
    ]

    if srid_4326:
        struct_fields.extend([
            pl.Field("geocode_lat", pl.String),
            pl.Field("geocode_lon", pl.String),
        ])

    if srid_2272:
        struct_fields.extend([
            pl.Field("geocode_x", pl.String),
            pl.Field("geocode_y", pl.String)
        ])

    new_cols = pl.Struct(struct_fields)
    field_names = [f.name for f in new_cols.fields]

    with requests.Session() as sess:
        added = (
            # Use the joined raw (not parsed with passyunk) address for tomtom, as passyunk parser
            # may sometimes strip out key information
            to_add.with_columns(
                pl.struct(["raw_api_address", "output_address"])
                .map_elements(
                    lambda cols: tomtom_lookup(
                        sess,
                        parser,
                        ZIPS,
                        cols["raw_api_address"],
                        cols["output_address"],
                        srid_4326,
                        srid_2272,
                    ),
                    return_dtype=new_cols,
                )
                .alias("tomtom_struct")
            )
            .with_columns(
                *[
                    pl.col("tomtom_struct").struct.field(n).alias(n)
                    for n in field_names
                ]
            )
            .drop("tomtom_struct", "raw_api_address")
        )

    return added


@click.command()
@click.option(
    "--config_path",
    default="./config.yml",
    prompt=True,
    show_default="./config.yml",
    help="The path to the config file.",
)
def process_csv(config_path):
    """
    Given a config file with the csv filepath, normalizes records
    in that file using Passyunk.

    Args:
        config_path (str): The path to the config file
    """
    current_time = get_current_time()
    print(f"Beginning enrichment process at {current_time}.")

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    srid_4326 = config.get("srid_4326")
    srid_2272 = config.get("srid_2272")

    if not srid_4326 and not srid_2272:
        raise ValueError(
            "Invalid configuration: At least one SRID must be enabled. "
            "Set srid_4326 or srid_2272 to true in your config file."
        )

    filepath = config.get("input_file")
    geo_filepath = config.get("geography_file")

    if not filepath:
        raise ValueError("An input filepath must be specified in the config file.")

    if not geo_filepath:
        raise ValueError(
            "A filepath for the geography file must bespecified in the config."
        )

    # Determine which fields in the file are the address fields
    address_fields = find_address_fields(config)

    # Detect input file encoding
    encoding = detect_file_encoding(filepath)

    # Save original filepath for output naming
    original_filepath = filepath

    # If encoding is not UTF-8, recode it
    utf8_filepath = ""
    if encoding != "UTF-8":
        print(f"Converting file encoding from {encoding} to UTF-8")

        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".csv", delete=False, encoding="utf-8"
        ) as temp_file:
            utf8_filepath = temp_file.name

        recode_to_utf8(filepath, utf8_filepath, encoding)
        filepath = utf8_filepath

    try:
        # infer schema = False infers everything as a string. Otherwise, polars
        # will attempt to infer zip codes like 19114-3409 as an int
        lf = pl.scan_csv(
            filepath,
            row_index_name="__geocode_idx__",
            infer_schema=False,
            encoding="utf8-lossy",
        )

        # Check if there are invalid address fields specified
        file_cols = lf.collect_schema().names()
        address_fields_list = [field for field in address_fields.values() if field]
        diff = [field for field in address_fields_list if field not in file_cols]

        if diff:
            raise ValueError(
                "The following fields specified in the config"
                f"file are not present in the input file: {diff}"
            )

        # ---------------- Join Addresses to Address File -------------------#

        passyunk_address_field = address_fields.get(
            "full_address"
        ) or address_fields.get("street_address")

        parser = PassyunkParser()

        # Create raw address field, used later to attempt to match
        # raw address against TomTom if the passyunk parsed address
        # fails to match
        lf = lf.with_columns(pl.col(passyunk_address_field).alias("raw_address"))

        lf = parse_with_passyunk_parser(parser, passyunk_address_field, lf)

        # After parsing with Passyunk, rebuild joined_address using the cleaned output_address
        # Only do this for split address fields (street/city/state/zip)
        # Don't do this for full_address fields, as Passyunk strips city/state
        if "street_address" in address_fields.keys():
            # Build list of available location components
            location_components = []
            for key in ["city", "state", "zip"]:
                if key in address_fields.keys() and address_fields[key] is not None:
                    location_components.append(
                        pl.col(address_fields[key]).fill_null("")
                    )

            lf = lf.with_columns(
                pl.when(pl.col("output_address").is_not_null())
                .then(
                    pl.concat_str(
                        [pl.col("output_address")] + location_components,
                        separator=" ",
                    )
                    .str.replace_all(r"\s+", " ")
                    .str.strip_chars()
                )
                .otherwise(pl.col(passyunk_address_field))
                .alias("joined_address"),

                pl.concat_str(
                [pl.col("raw_address")] + location_components,
                separator=" ",
                ).str.replace_all(r"\s+", " ")\
                    .str.strip_chars()\
                        .alias("raw_address"),  # overwrite raw_address in place
            )
        else:
            # For full_address cases, use the original field as joined_address
            lf = lf.with_columns(pl.col(passyunk_address_field).alias("joined_address"))

        # ---------------- Split out Non Philly Addresses -------------------#
        philly_lf, non_philly_lf = split_non_philly_address(config, lf)

        # Generate the names of columns to add for both the AIS API
        # and the address file
        ais_enrichment_fields, address_file_enrichment_fields = build_enrichment_fields(
            config
        )

        joined_lf = add_address_file_fields(
            geo_filepath, philly_lf, address_file_enrichment_fields, config
        )

        # Split out fields that did not match the address file
        # and attempt to match them with the AIS API

        # -------------------------- Add Fields from AIS ------------------ #
        has_geo, needs_geo = split_geos(joined_lf, config)

        uses_full_address = bool(address_fields.get("full_address"))
        ais_enriched = enrich_with_ais(
            config, needs_geo, uses_full_address, ais_enrichment_fields
        )

        ais_rejoined = pl.concat([has_geo, ais_enriched]).sort("__geocode_idx__")

        # -------------- Check Match Failures Against TomTom ------------------ #

        has_geo, needs_geo = split_geos(ais_rejoined, config)

        # Rejoin the addresses marked as non-philly for tomtom search
        # at the beginning of the process
        needs_geo = pl.concat([non_philly_lf, needs_geo], how="diagonal").sort(
            "__geocode_idx__"
        )

        tomtom_enriched = enrich_with_tomtom(parser, config, needs_geo)

        # -------------- Check TomTom matches against AIS again ---------------- #

        # This melted my brain a little bit so I'm writing it out here:
        # 1. We see which records that TomTom failed to match are in Philly
        # 2. We reinrich those with AIS to see if the new TomTom parsed address is
        # searchable with AIS, allowing us to potentially recover enrichment fields
        # 3. That either geocodes or doesn't. We take the records that AIS failed to geocode.
        # 4. We use the tomtom matched record for the records that AIS failed to geocode.
        # 5. We rejoin those to the records that AIS did manage to reinrich
        # 6. We rejoin those records to the non-philadelphia records that shouldn't be run through AIS
        # 7. We rejoin that again back to the original 'has_geo' -- the records that never needed
        # to be matched to TomTom in the first place.

        tomtom_enriched_non_philly = tomtom_enriched.filter(pl.col("is_non_philly"))
        tomtom_enriched_is_philly = tomtom_enriched.filter(~pl.col("is_non_philly"))

        ais_reinriched = enrich_with_ais(config, tomtom_enriched_is_philly, uses_full_address, ais_enrichment_fields)

        reinriched_has_geo, reinriched_needs_geo = split_geos(ais_reinriched, config)

        # Indicate that the record was geocoded with a combination of tomtom and AIS
        reinriched_has_geo = reinriched_has_geo.with_columns(
            pl.col("geocoder_used").str.replace("ais", "tomtom-ais").alias("geocoder_used")
        )

        failed_idx = reinriched_needs_geo.select("__geocode_idx__")

        tomtom_fallback = tomtom_enriched.join(failed_idx, on="__geocode_idx__", how="inner")

        cols = tomtom_enriched.collect_schema().names()

        # Make sure rejoined tables have same fields in same order
        reinriched_rejoined = pl.concat([reinriched_has_geo, tomtom_fallback], how="diagonal").select(cols)
        non_philly_rejoined = pl.concat([tomtom_enriched_non_philly, reinriched_rejoined], how="diagonal").select(cols)

        rejoined = (
            pl.concat([has_geo, non_philly_rejoined])
            .sort("__geocode_idx__")
            .drop(
                ["__geocode_idx__", "joined_address", "is_non_philly", "is_undefined", "raw_address"]
            )
        )

        # Reorder fields so that all geocode fields are adjacent
        final_cols = rejoined.collect_schema().names()

        # Remove all geocode columns from the list
        geo_cols = []
        if srid_4326:
            geo_cols.extend(["geocode_lat", "geocode_lon"])

        if srid_2272:
            geo_cols.extend(["geocode_x", "geocode_y"])

        cols_without_geo = [c for c in final_cols if c not in geo_cols]

        if "geocoder_used" in cols_without_geo:
            insert_idx = cols_without_geo.index("geocoder_used") + 1
        else:
            insert_idx = 0

        # Insert all geocode columns together after geocoder_used
        ordered_cols = (
            cols_without_geo[:insert_idx] +
            geo_cols +
            cols_without_geo[insert_idx:]
        )

        # Drop raw address field, no longer need it after tomtom match
        rejoined = rejoined.select(ordered_cols)

        # -------------------- Save Output File ---------------------- #

        in_path = PurePath(original_filepath)

        # If filepath has multiple suffixes, remove them
        stem = in_path.name.replace("".join(in_path.suffixes), "")

        out_path = f"{stem}_enriched.csv"

        out_path = str(in_path.parent / out_path)

        rejoined.sink_csv(out_path)

        current_time = get_current_time()
        print(f"Enrichment complete at {current_time}.")

    finally:
        if utf8_filepath:
            os.remove(utf8_filepath)


if __name__ == "__main__":
    process_csv()