|
2 | 2 | import numpy as np |
3 | 3 | import os # Import os for path manipulation |
4 | 4 |
|
| 5 | + |
def build_extra_info(value: object, index: int) -> dict[str, object]:
    """Return a fresh ``extra_info`` dict for one row, tagged with ``index``.

    If ``value`` is a dict, its entries are shallow-copied into the result so
    the caller's object is never mutated. Any other value (for example
    ``None``) is ignored and the result starts out empty. The ``"index"`` key
    is always set to ``index``, replacing any entry already stored under it.
    """
    # Only real dicts contribute existing entries; everything else is dropped.
    base = value if isinstance(value, dict) else {}
    # Unpacking builds a new dict, and the trailing key overrides any
    # pre-existing "index" entry while keeping its original position.
    return {**base, "index": index}
| 13 | + |
5 | 14 | # --- Configuration --- |
6 | 15 | # Define the directory containing the input file |
7 | 16 | data_directory = 'data' |
|
30 | 39 | dummy_df.to_parquet(input_parquet_path) |
31 | 40 |
|
32 | 41 |
|
33 | | - # Read the Parquet file into a pandas DataFrame |
34 | | - # We don't need the original index, so we can reset it immediately if needed, |
35 | | - # but setting df.index directly below overwrites it anyway. |
| 42 | + # Read the Parquet file into a pandas DataFrame. |
36 | 43 | print(f"Reading Parquet file from: {input_parquet_path}") |
37 | 44 | df = pd.read_parquet(input_parquet_path) |
38 | 45 | print("Original DataFrame info:") |
|
45 | 52 | num_rows = len(df) |
46 | 53 | print(f"\nDataFrame has {num_rows} rows.") |
47 | 54 |
|
48 | | - # Create a new sequential index starting from 1 up to the number of rows |
49 | | - # Name the new index 'extra_info' as requested |
50 | | - print("Generating new sequential index named 'extra_info' from 1...") |
51 | | - new_index = pd.RangeIndex(start=1, stop=num_rows + 1, step=1, name='extra_info') |
| 55 | + # RLHFDataset reads row_dict["extra_info"]["index"], so store the repeat |
| 56 | + # index inside the extra_info column rather than as a pandas index. |
| 57 | + print("Generating 0-based extra_info.index values...") |
| 58 | + if "extra_info" in df.columns: |
| 59 | + existing_extra_info = df["extra_info"].tolist() |
| 60 | + else: |
| 61 | + existing_extra_info = [None] * num_rows |
52 | 62 |
|
53 | | - # Set the new index for the DataFrame, replacing the old one |
54 | | - df.index = new_index |
55 | | - print("New index assigned.") |
| 63 | + df["extra_info"] = [ |
| 64 | + build_extra_info(value=value, index=index) |
| 65 | + for index, value in enumerate(existing_extra_info) |
| 66 | + ] |
| 67 | + df = df.reset_index(drop=True) |
| 68 | + print("extra_info.index assigned.") |
56 | 69 |
|
57 | 70 | # Write the modified DataFrame back to a new Parquet file |
58 | | - # index=True ensures the new index ('extra_info') is written to the file |
59 | 71 | print(f"Writing modified DataFrame to: {output_parquet_path}") |
60 | | - df.to_parquet(output_parquet_path, index=True) |
| 72 | + df.to_parquet(output_parquet_path, index=False) |
61 | 73 |
|
62 | 74 | print("\n--- Success ---") |
63 | 75 | print(f"Successfully processed '{input_parquet_path}'.") |
64 | | - print(f"Created new index named 'extra_info' from 1 to {num_rows}.") |
| 76 | + if num_rows: |
| 77 | + print(f"Created 0-based extra_info.index values from 0 to {num_rows - 1}.") |
| 78 | + else: |
| 79 | + print("Created empty extra_info.index values.") |
65 | 80 | print(f"Output saved to '{output_parquet_path}'.") |
66 | 81 |
|
67 | 82 | # Display the first few rows with the new index to verify |
|
0 commit comments