Skip to content

Commit 83efa40

Browse files
author
Simon Frost
committed
Add Windows compatibility, geo_loc_name support, and fix memory leaks
- Add support for /geo_loc_name qualifier (GenBank 2024 change) in addition to /country
- Add Windows compatibility with portable strptime and getopt implementations
- Replace VLA with dynamic allocation for MSVC compatibility
- Fix memory leak: sCountry2 was allocated but never freed in main loop
- Fix memory leak: add freeSingleGBData() for proper cleanup on parse errors
- Update README with Windows build instructions (WSL, MSYS2, PCRE2, TRE)
1 parent adbead3 commit 83efa40

File tree

3 files changed

+271
-40
lines changed

3 files changed

+271
-40
lines changed

README.md

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ This little C program will extract the following information from a GenBank file
99
- length
1010
- submission date
1111
- host
12-
- country
12+
- country (supports both `/country` and `/geo_loc_name` qualifiers)
1313
- collection date
1414

1515
In addition to extracting this information, dates are reformatted (e.g. `31-DEC-2001` becomes `2001-12-31`), which makes them more digestible to downstream software like BEAST, and country names are cleaned and matched to ISO3 codes.
@@ -30,6 +30,8 @@ gbmunge [-h] -i <Genbank_file> -f <sequence_output> -o <metadata_output> [-t] [-
3030

3131
## Building
3232

33+
### Linux and macOS
34+
3335
```sh
3436
git clone https://github.com/sdwfrost/gbmunge
3537
cd gbmunge
@@ -38,6 +40,40 @@ make
3840

3941
This will build `gbmunge` in the `src/` directory. Add that directory to your `PATH`, or move the executable to a directory that is already on your `PATH`.
4042

43+
### Windows
44+
45+
There are several options for building on Windows:
46+
47+
1. **Using WSL (Windows Subsystem for Linux)** (Recommended):
48+
```sh
49+
# Install WSL with Ubuntu, then in the WSL terminal:
50+
sudo apt update
51+
sudo apt install build-essential
52+
cd gbmunge
53+
make
54+
```
55+
56+
2. **Using MSYS2/MinGW**:
57+
```sh
58+
# Install MSYS2, then in MSYS2 terminal:
59+
pacman -S mingw-w64-x86_64-gcc make
60+
cd gbmunge
61+
make
62+
```
63+
64+
3. **Using Visual Studio with vcpkg**:
65+
Building natively with MSVC requires a POSIX-compatible regex library:
66+
```sh
67+
# Install PCRE2 via vcpkg
68+
vcpkg install pcre2:x64-windows
69+
70+
# Compile with PCRE2 support (modify Makefile or compile manually)
71+
cl /DGBMUNGE_USE_PCRE2 /I<vcpkg_include_path> gbfp.c gbmunge.c /link pcre2-8.lib
72+
```
73+
74+
4. **Using TRE regex library**:
75+
Download TRE from https://github.com/laurikari/tre and place the POSIX-compatible `regex.h` in the `src/` directory.
76+
4177
## Testing
4278

4379
A Genbank file of MERS Coronavirus sequences is provided in the `test/` directory.

src/gbfp.c

Lines changed: 56 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,27 @@
22

33
#include <stdio.h>
44
#include <stdlib.h>
5-
#include <regex.h>
6-
#include <unistd.h>
75
#include <string.h>
86
#include <ctype.h>
97
#include <limits.h>
10-
#include <sys/types.h>
8+
9+
#ifdef _WIN32
10+
#include <windows.h>
11+
/* Windows builds need a user-supplied POSIX-compatible regex library. */
12+
/* Either define GBMUNGE_USE_PCRE2 and link with PCRE2, or provide a POSIX-compatible "regex.h" (e.g. from TRE). */
13+
#ifdef GBMUNGE_USE_PCRE2
14+
#define PCRE2_CODE_UNIT_WIDTH 8
15+
#include <pcre2.h>
16+
/* PCRE2 wrapper - would need implementation */
17+
#else
18+
/* Fallback: use bundled regex implementation or TRE */
19+
#include "regex.h" /* User must provide POSIX-compatible regex.h for Windows */
20+
#endif
21+
#else
22+
#include <regex.h>
23+
#include <unistd.h>
24+
#include <sys/types.h>
25+
#endif
1126

1227
#include "gbfp.h"
1328

@@ -55,6 +70,9 @@ void freeRegEx(void) {
5570
regfree(&ptRegExGI);
5671
}
5772

73+
/* Forward declaration for freeSingleGBData */
74+
static void freeSingleGBData(gb_data *ptGBData);
75+
5876
/* Removes white spaces at end of a string */
5977
static void rtrim(gb_string sLine) {
6078
register int i;
@@ -640,7 +658,7 @@ static gb_data *_parseGBFF(FILE *FSeqFile) {
640658

641659
/* Parse LOCUS line */
642660
if (parseLocus(sLine, ptGBData) != 0) {
643-
free(ptGBData);
661+
freeSingleGBData(ptGBData);
644662
return NULL;
645663
}
646664

@@ -695,36 +713,35 @@ gb_data **parseGBFF(gb_string spFileName) {
695713
return pptGBDatas;
696714
}
697715

698-
void freeGBData(gb_data **pptGBData) {
716+
/* Free a single gb_data structure and all its contents */
717+
static void freeSingleGBData(gb_data *ptGBData) {
699718
unsigned int i;
700-
gb_data *ptGBData = NULL;
701719
gb_feature *ptFeatures = NULL;
702720
gb_reference *ptReferences = NULL;
703721
unsigned int iFeatureNum = 0;
704722
unsigned int iReferenceNum = 0;
705-
unsigned int iSeqPos = 0;
706-
707-
for (iSeqPos = 0; *(pptGBData + iSeqPos) != NULL; iSeqPos++) {
708-
ptGBData = *(pptGBData + iSeqPos);
709723

710-
ptFeatures = ptGBData->ptFeatures;
711-
iFeatureNum = ptGBData->iFeatureNum;
724+
if (ptGBData == NULL) return;
712725

726+
ptFeatures = ptGBData->ptFeatures;
727+
iFeatureNum = ptGBData->iFeatureNum;
713728

714-
/* Release memory space for features */
729+
/* Release memory space for features */
730+
if (ptFeatures != NULL) {
715731
for (i = 0; i < iFeatureNum; i++) {
716732
free((ptFeatures + i)->ptLocation);
717733
if ((ptFeatures + i)->ptQualifier != NULL) {
718734
free(((ptFeatures + i)->ptQualifier)->sQualifier);
719735
free((ptFeatures + i)->ptQualifier);
720736
}
721737
}
722-
723738
free(ptFeatures);
739+
}
724740

725-
/* Release memory space for References */
726-
ptReferences = ptGBData->ptReferences;
727-
iReferenceNum = ptGBData->iReferenceNum;
741+
/* Release memory space for References */
742+
ptReferences = ptGBData->ptReferences;
743+
iReferenceNum = ptGBData->iReferenceNum;
744+
if (ptReferences != NULL) {
728745
for (i = 0; i < iReferenceNum; i++) {
729746
free((ptReferences + i)->sAuthors);
730747
free((ptReferences + i)->sConsrtm);
@@ -735,19 +752,29 @@ void freeGBData(gb_data **pptGBData) {
735752
free((ptReferences + i)->sRemark);
736753
}
737754
free(ptReferences);
755+
}
738756

739-
free(ptGBData->sDef);
740-
free(ptGBData->sAccession);
741-
free(ptGBData->sComment);
742-
free(ptGBData->sGI);
743-
free(ptGBData->sKeywords);
744-
free(ptGBData->sLineage);
745-
free(ptGBData->sOrganism);
746-
free(ptGBData->sSequence);
747-
free(ptGBData->sSource);
748-
free(ptGBData->sVersion);
749-
750-
free(ptGBData);
757+
free(ptGBData->sDef);
758+
free(ptGBData->sAccession);
759+
free(ptGBData->sComment);
760+
free(ptGBData->sGI);
761+
free(ptGBData->sKeywords);
762+
free(ptGBData->sLineage);
763+
free(ptGBData->sOrganism);
764+
free(ptGBData->sSequence);
765+
free(ptGBData->sSource);
766+
free(ptGBData->sVersion);
767+
768+
free(ptGBData);
769+
}
770+
771+
void freeGBData(gb_data **pptGBData) {
772+
unsigned int iSeqPos = 0;
773+
774+
if (pptGBData == NULL) return;
775+
776+
for (iSeqPos = 0; *(pptGBData + iSeqPos) != NULL; iSeqPos++) {
777+
freeSingleGBData(*(pptGBData + iSeqPos));
751778
}
752779

753780
free(pptGBData);

0 commit comments

Comments
 (0)