Skip to content

Commit 83292dc

Browse files
committed
added manifest
1 parent 5a1fd1b commit 83292dc

File tree

5 files changed

+28
-21
lines changed

5 files changed

+28
-21
lines changed

run

Lines changed: 18 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -4,14 +4,16 @@ import argparse
44
import datetime
55
import logging
66
import os
7+
import sys
78

89
from transcribe import aws, google, whisper
910

1011
parser = argparse.ArgumentParser(
1112
prog="run", description="Run transcription generation for sample data"
1213
)
1314

14-
parser.add_argument("--output_dir", help="Path to a directory to write results")
15+
parser.add_argument("--output-dir", help="Path to a directory to write results")
16+
parser.add_argument("--manifest", default="data.csv", help="Path to data manifest CSV")
1517
parser.add_argument(
1618
"--only",
1719
choices=["whisper", "preprocessing", "aws", "google"],
@@ -27,6 +29,10 @@ if output_dir is None:
2729
if not os.path.isdir(output_dir):
2830
os.makedirs(output_dir)
2931

32+
# ensure manifest CSV exists
33+
if not os.path.isfile(args.manifest):
34+
sys.exit(f"manifest file {args.manifest} doesn't exist")
35+
3036
logging.basicConfig(
3137
filename=os.path.join(output_dir, "transcribe.log"),
3238
filemode="a",
@@ -35,18 +41,20 @@ logging.basicConfig(
3541
level=logging.INFO,
3642
)
3743

38-
3944
# run one of the transcription types individually or run them all
4045
if args.only == "whisper":
41-
whisper.run(output_dir)
46+
whisper.run(output_dir, args.manifest)
4247
elif args.only == "preprocessing":
43-
whisper.run_preprocessing(output_dir)
48+
whisper.run_preprocessing(output_dir, args.manifest)
4449
elif args.only == "aws":
45-
aws.run(output_dir)
50+
aws.run(output_dir, args.manifest)
4651
elif args.only == "google":
47-
google.run(output_dir)
52+
google.run(output_dir, args.manifest)
4853
else:
49-
whisper.run(output_dir)
50-
whisper.run_preprocessing(output_dir)
51-
aws.run(output_dir)
52-
google.run(output_dir)
54+
whisper.run(output_dir, args.manifest)
55+
print()
56+
whisper.run_preprocessing(output_dir, args.manifest)
57+
print()
58+
aws.run(output_dir, args.manifest)
59+
print()
60+
google.run(output_dir, args.manifest)

transcribe/aws.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,9 @@
1818
dotenv.load_dotenv()
1919

2020

21-
def run(output_dir):
21+
def run(output_dir, manifest):
2222
results = []
23-
for file_metadata in tqdm.tqdm(utils.get_data_files(), desc="aws".ljust(10)):
23+
for file_metadata in tqdm.tqdm(utils.get_data_files(manifest), desc="aws".ljust(10)):
2424
file_metadata["run_count"] = len(results) + 1
2525
file = file_metadata["media_filename"]
2626

transcribe/google.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,9 +13,9 @@
1313
from . import utils
1414

1515

16-
def run(output_dir):
16+
def run(output_dir, manifest):
1717
results = []
18-
for file_metadata in tqdm.tqdm(utils.get_data_files(), desc="google".ljust(10)):
18+
for file_metadata in tqdm.tqdm(utils.get_data_files(manifest), desc="google".ljust(10)):
1919
file_metadata["run_count"] = len(results) + 1
2020
file = file_metadata["media_filename"]
2121

transcribe/utils.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,10 +28,9 @@
2828
]
2929

3030

31-
def get_data_files():
31+
def get_data_files(manifest):
3232
rows = []
33-
data_csv = Path(__file__).parent.parent / "data.csv"
34-
for row in csv.DictReader(open(data_csv)):
33+
for row in csv.DictReader(open(manifest)):
3534
rows.append(row)
3635
return rows
3736

transcribe/whisper.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -42,9 +42,9 @@
4242
]
4343

4444

45-
def run(output_dir):
45+
def run(output_dir, manifest):
4646
combinations = list(whisper_option_combinations())
47-
files = utils.get_data_files()
47+
files = utils.get_data_files(manifest)
4848
total = len(combinations) * len(files)
4949
progress = tqdm.tqdm(total=total, desc="whisper".ljust(10))
5050

@@ -60,9 +60,9 @@ def run(output_dir):
6060
utils.write_report(results, csv_filename, extra_cols=["options"])
6161

6262

63-
def run_preprocessing(output_dir):
63+
def run_preprocessing(output_dir, manifest):
6464
results = []
65-
files = utils.get_data_files()
65+
files = utils.get_data_files(manifest)
6666
total = len(files) * len(preprocessing_combinations)
6767
progress = tqdm.tqdm(total=total, desc="preprocess".ljust(10))
6868

0 commit comments

Comments
 (0)