1818
1919#include < nlohmann/json.hpp>
2020
21+ #include < cstdio>
2122#include < iomanip>
2223#include < iostream>
24+ #include < iterator>
2325
2426using json = nlohmann::json;
2527
@@ -86,6 +88,57 @@ json eigenColToJson(const MatrixX& matrix, int column)
8688 return j;
8789}
8890
91+ // Sanitize a raw string by replacing invalid UTF-8 sequences with '?'
92+ // (to work around a bug in Open Babel space group output)
93+ std::string sanitizeUtf8 (const std::string& s)
94+ {
95+ std::string result;
96+ result.reserve (s.size ());
97+ const unsigned char * bytes = reinterpret_cast <const unsigned char *>(s.data ());
98+ size_t len = s.size ();
99+
100+ for (size_t i = 0 ; i < len; ++i) {
101+ if (bytes[i] <= 0x7F ) {
102+ result.push_back (static_cast <char >(bytes[i]));
103+ continue ;
104+ }
105+
106+ size_t remaining = 0 ;
107+ if ((bytes[i] & 0xE0 ) == 0xC0 )
108+ remaining = 1 ;
109+ else if ((bytes[i] & 0xF0 ) == 0xE0 )
110+ remaining = 2 ;
111+ else if ((bytes[i] & 0xF8 ) == 0xF0 )
112+ remaining = 3 ;
113+ else {
114+ result.push_back (' ?' ); // invalid lead byte
115+ continue ;
116+ }
117+
118+ if (i + remaining >= len) {
119+ result.push_back (' ?' ); // truncated sequence
120+ continue ;
121+ }
122+
123+ bool valid = true ;
124+ for (size_t j = 1 ; j <= remaining; ++j) {
125+ if ((bytes[i + j] & 0xC0 ) != 0x80 ) {
126+ valid = false ;
127+ break ;
128+ }
129+ }
130+
131+ if (valid) {
132+ for (size_t j = 0 ; j <= remaining; ++j)
133+ result.push_back (static_cast <char >(bytes[i + j]));
134+ i += remaining;
135+ } else {
136+ result.push_back (' ?' ); // invalid continuation
137+ }
138+ }
139+ return result;
140+ }
141+
89142bool CjsonFormat::read (std::istream& file, Molecule& molecule)
90143{
91144 return deserialize (file, molecule, true );
@@ -95,6 +148,7 @@ bool CjsonFormat::deserialize(std::istream& file, Molecule& molecule,
95148 bool isJson)
96149{
97150 json jsonRoot;
151+
98152 // could throw parse errors
99153 try {
100154 if (isJson)
@@ -109,8 +163,18 @@ bool CjsonFormat::deserialize(std::istream& file, Molecule& molecule,
109163 return false ;
110164 }
111165
166+ if (jsonRoot.is_discarded () && isJson) {
167+ // Initial parse failed - try sanitizing UTF-8 and re-parsing
168+ file.clear ();
169+ file.seekg (0 );
170+ std::string content ((std::istreambuf_iterator<char >(file)),
171+ std::istreambuf_iterator<char >());
172+ std::string sanitizedContent = sanitizeUtf8 (content);
173+ jsonRoot = json::parse (sanitizedContent, nullptr , false );
174+ }
175+
112176 if (jsonRoot.is_discarded ()) {
113- appendError (" Error reading CJSON file." );
177+ appendError (" Error reading CJSON file. Root is discarded. " );
114178 return false ;
115179 }
116180
@@ -474,18 +538,20 @@ bool CjsonFormat::deserialize(std::istream& file, Molecule& molecule,
474538 return false ;
475539 }
476540 }
477- if (unitCellObject != nullptr )
541+ if (unitCellObject != nullptr ) {
478542 molecule.setUnitCell (unitCellObject);
479543
480- // check for Hall number if present
481- if (unitCell[" hallNumber" ].is_number ()) {
482- auto hallNumber = static_cast <int >(unitCell[" hallNumber" ]);
483- if (hallNumber > 0 && hallNumber < 531 )
484- molecule.setHallNumber (hallNumber);
485- } else if (unitCell[" spaceGroup" ].is_string ()) {
486- auto hallNumber = Core::SpaceGroups::hallNumber (unitCell[" spaceGroup" ]);
487- if (hallNumber != 0 )
488- molecule.setHallNumber (hallNumber);
544+ // check for Hall number if present
545+ if (unitCell[" hallNumber" ].is_number ()) {
546+ auto hallNumber = static_cast <int >(unitCell[" hallNumber" ]);
547+ if (hallNumber > 0 && hallNumber < 531 )
548+ molecule.setHallNumber (hallNumber);
549+ } else if (unitCell[" spaceGroup" ].is_string ()) {
550+ auto hallNumber =
551+ Core::SpaceGroups::hallNumber (unitCell[" spaceGroup" ]);
552+ if (hallNumber != 0 )
553+ molecule.setHallNumber (hallNumber);
554+ }
489555 }
490556 }
491557 }
0 commit comments