Skip to content

Commit 33b5f5c

Browse files
committed
feat(#141): embed
1 parent 966b8bd commit 33b5f5c

4 files changed

Lines changed: 4 additions & 4 deletions

File tree

sr-data/src/sr_data/steps/embed.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,7 @@ def main(repos, prefix, hf, cohere):
5656
embed_cohere(cohere, frame, prefix)
5757
else:
5858
logger.info(f"Inference checkpoint: {checkpoint}")
59-
embeddings = pd.DataFrame(infer(frame["top"].tolist(), checkpoint, hf))
59+
embeddings = pd.DataFrame(infer(frame["mcw"].tolist(), checkpoint, hf))
6060
embeddings.insert(0, 'repo', frame["repo"])
6161
embeddings.to_csv(f"{prefix}-{model}.csv", index=False)
6262
logger.info(

sr-data/src/sr_data/steps/extract.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ def main(repos, out):
6161
logger.info(
6262
f"Removed {headingless - len(frame)} repositories that have 0 headings after regex filtering ('{rword}')"
6363
)
64-
frame["top"] = frame["headings"].apply(
64+
frame["mcw"] = frame["headings"].apply(
6565
lambda headings: top_words(headings, 5)
6666
)
6767
frame.to_csv(out, index=False)

sr-data/src/tests/test_embed.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -98,7 +98,7 @@ def test_creates_csv_with_embeddings(self):
9898
with TemporaryDirectory() as temp:
9999
main(
100100
os.path.join(
101-
os.path.dirname(os.path.realpath(__file__)), "embed.csv"
101+
os.path.dirname(os.path.realpath(__file__)), "to-embed.csv"
102102
),
103103
temp,
104104
os.environ["HF_TESTING_TOKEN"],
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,2 @@
1-
repo,top
1+
repo,mcw
22
foo,"['test', 'dummy']"

0 commit comments

Comments
 (0)