Skip to content

csv_example.py - program terminated without error message #128

@surianisha

Description

@surianisha

I couldn't find 'csv_example_training.json' in the repo, so I used 'csv_input_with_true_ids.csv' as the training file instead. There was no settings file either, so I couldn't use one (that line is commented out in the code shared below).
Made sure to use consoleLabel() instead of console_label().

I followed the steps in csv_example.py. Active learning started, but the program then terminated without any error message.
Screen Shot 2022-06-09 at 7 59 19 PM

The code is below:
##################################################
import os
import csv
import re
import logging
import optparse

import dedupe
from unidecode import unidecode

def preProcess(column):
    """Normalise one raw CSV cell before it is handed to dedupe.

    The value is transliterated with ``unidecode``, runs of two or more
    spaces are collapsed to one, newlines become spaces, and surrounding
    whitespace and quote characters are stripped before lower-casing.

    Args:
        column: The raw cell text, or ``None`` when the CSV row had no
            value for this column.

    Returns:
        The cleaned string, or ``None`` when the cell is absent or empty
        after cleaning.
    """
    # csv.DictReader fills the missing cells of a short row with None;
    # pass that through instead of crashing inside unidecode().
    if column is None:
        return None

    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()

    # Report an empty cell as None rather than as an empty string.
    if not column:
        column = None
    return column

def readData(filename):
    """Load a CSV file into a dictionary of cleaned records.

    Every cell of every row is passed through ``preProcess``. The value of
    the row's ``Id`` column, converted to ``int``, is used as the key.

    Args:
        filename: Path of the CSV file to read. It must have a header row
            containing an ``Id`` column.

    Returns:
        A dict mapping each integer record id to a dict of
        ``{column name: cleaned value}`` for that row.

    Raises:
        KeyError: If a row has no ``Id`` column.
        ValueError: If an ``Id`` value is not an integer.
    """
    data_d = {}
    with open(filename) as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = {k: preProcess(v) for k, v in row.items()}
            # The id is taken from the raw row, not the cleaned one.
            row_id = int(row['Id'])
            data_d[row_id] = clean_row

    return data_d

# example
# Location of the input data: `path` is the directory (with a trailing
# slash) and `filename` is the messy CSV inside it.

path = '/Users/asuri/Downloads/dedupe-examples-master/csv_example/'
filename = 'csv_example_messy_input.csv'

#######################################

if __name__ == '__main__':

    # Logging: each -v on the command line raises the verbosity one step
    # (none -> WARNING, -v -> INFO, -vv or more -> DEBUG).
    optp = optparse.OptionParser()
    optp.add_option('-v', '--verbose', dest='verbose', action='count',
                    help='Increase verbosity (specify multiple times for more)'
                    )
    (opts, args) = optp.parse_args()
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.getLogger().setLevel(log_level)

    input_file = os.path.join(path, filename)
    output_file = os.path.join(path, 'output.csv')
    #settings_file = 'csv_example_learned_settings'
    # The training file holds the labelled pairs written by
    # write_training() below; it does not need to exist on the first run.
    # It must NOT point at a CSV data file: an existing file is handed to
    # prepare_training() as training data and is overwritten after
    # labelling.
    training_file = os.path.join(path, 'csv_example_training.json')

    print('importing data ...')
    data_d = readData(input_file)

    # Fields dedupe compares; 'has missing' marks columns that may be None.
    fields = [
        {'field': 'Site name', 'type': 'String'},
        {'field': 'Address', 'type': 'String'},
        {'field': 'Zip', 'type': 'Exact', 'has missing': True},
        {'field': 'Phone', 'type': 'String', 'has missing': True},
    ]

    deduper = dedupe.Dedupe(fields)

    # Reuse labelled examples from an earlier run when they are available.
    if os.path.exists(training_file):
        print('reading labeled examples from ', training_file)
        with open(training_file, 'rb') as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    print('starting active labeling...')

    # The snake_case methods used in this script (write_training,
    # partition) belong to the dedupe 2.x API, where the interactive
    # labeller is console_label(); consoleLabel() is the 1.x name.
    dedupe.console_label(deduper)

    deduper.train()

    # Persist the labelled pairs so the next run can skip re-labelling.
    with open(training_file, 'w') as tf:
        deduper.write_training(tf)

    print('clustering...')
    clustered_dupes = deduper.partition(data_d, 0.5)
    print('# duplicate sets', len(clustered_dupes))

    # Map every record id to the cluster it landed in and its score.
    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score
            }

    # Re-read the original rows and write them out with the two cluster
    # columns prepended. newline='' on the output keeps the csv writer
    # from emitting extra blank lines on platforms that translate '\n'.
    with open(output_file, 'w', newline='') as f_output, \
            open(input_file) as f_input:

        reader = csv.DictReader(f_input)
        fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            # Same 'Id' column that readData() keys the records on.
            row_id = int(row['Id'])
            row.update(cluster_membership[row_id])
            writer.writerow(row)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions