|
1 | 1 | import pathlib |
| 2 | +import sys |
2 | 3 | from itertools import islice |
3 | 4 |
|
4 | 5 | import flatgfa |
5 | 6 |
|
6 | | - |
7 | | -def iter_bits_ones_dash(path, chunk_bytes: int = 64 * 1024): |
8 | | - ws = b" \t\r\n\v\f" |
9 | | - with open(path, "rb") as f: |
10 | | - while True: |
11 | | - chunk = f.read(chunk_bytes) |
12 | | - if not chunk: |
13 | | - break |
14 | | - for b in chunk: |
15 | | - if b in ws: |
16 | | - continue |
17 | | - if b == ord("1"): |
18 | | - yield True |
19 | | - elif b == ord("0"): |
20 | | - yield False |
21 | | - else: |
22 | | - raise ValueError( |
23 | | - f"Invalid character in file: {chr(b)!r} (expected '1' or '0')" |
24 | | - ) |
25 | | - |
26 | | - |
27 | | -def compare_row_to_file(row_bits, path) -> tuple[bool, int | None]: |
28 | | - n = len(row_bits) |
29 | | - i = 0 |
30 | | - for fb in iter_bits_ones_dash(path): |
31 | | - if i >= n: |
32 | | - # file has extra bits |
33 | | - raise ValueError("File contains more bits than the row length.") |
34 | | - if fb != bool(row_bits[i]): |
35 | | - return False, i |
36 | | - i += 1 |
37 | | - |
38 | | - if i != n: |
39 | | - # file ended early |
40 | | - raise ValueError(f"File ended early: got {i} bits, expected {n}.") |
41 | | - return True, None |
42 | | - |
43 | | - |
44 | | -TEST_DIR = pathlib.Path(__file__).parent |
45 | | -TEST_GFA = TEST_DIR / "./matrix_gaf_folder/Chr3.gfa" |
46 | | -GAF_DIR = (TEST_DIR / "./matrix_gaf_folder").resolve() |
47 | | -# MASTER_TXT = TEST_DIR / "./matrix_gaf_folder/Chr3-pangenotype" |
48 | | - |
49 | | -graph = flatgfa.parse(str(TEST_GFA)) |
50 | | -gaf = [str(p) for p in GAF_DIR.glob("*.gaf")] |
51 | | -pangenotype_matrix = graph.make_pangenotype_matrix(gaf) |
52 | | -# assert len(pangenotype_matrix) == len(gaf) |
53 | | -# first_row = list(pangenotype_matrix[0]) |
54 | | -# ok, where = compare_row_to_file(first_row, str(MASTER_TXT)) |
55 | | -# print("OK (all columns match)" if ok else f"Mismatch at column {where}") |
56 | 7 | FIRST_N = 100 |
57 | | -for gaf_path, row in zip(gaf, pangenotype_matrix): |
58 | | - row01 = [int(b) for b in row] |
59 | | - # print(pathlib.Path(gaf_path).name, *row01) |
60 | | - first_bits = islice(row, FIRST_N) |
61 | | - print(pathlib.Path(gaf_path).name, *map(int, first_bits)) |
| 8 | + |
| 9 | + |
| 10 | +def matrix_demo(gfa_path, gaf_dir): |
| 11 | + # Construct a pangenotype matrix for the `gfa_path` graph and all the GAF |
| 12 | + # files in `gaf_dir`. |
| 13 | + graph = flatgfa.parse(gfa_path) |
| 14 | + gaf = [str(p) for p in pathlib.Path(gaf_dir).glob("*.gaf")] |
| 15 | + pangenotype_matrix = graph.make_pangenotype_matrix(gaf) |
| 16 | + |
| 17 | + assert len(pangenotype_matrix) == len(gaf) |
| 18 | + |
| 19 | + # Just print out a few entries from the matrix. |
| 20 | + for gaf_path, row in zip(gaf, pangenotype_matrix): |
| 21 | + first_bits = islice(row, FIRST_N) |
| 22 | + print(pathlib.Path(gaf_path).name, *map(int, first_bits)) |
| 23 | + |
| 24 | + |
| 25 | +if __name__ == "__main__": |
| 26 | + if len(sys.argv) != 3: |
| 27 | + print("usage: matrix.py <GFA file> <GAF directory>") |
| 28 | + sys.exit(1) |
| 29 | + matrix_demo(*sys.argv[1:]) |
0 commit comments