@@ -3874,6 +3874,30 @@ archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
3874
3874
}
3875
3875
3876
3876
* p = NULL ;
3877
+ #if defined(_WIN32 ) && !defined(__CYGWIN__ )
3878
+ /*
3879
+ * On Windows, first try converting from WCS because (1) there's no
3880
+ * guarantee that the conversion to MBS will succeed, e.g. when using
3881
+ * CP_ACP, and (2) that's more efficient than converting to MBS, just to
3882
+ * convert back to WCS again before finally converting to UTF-8
3883
+ */
3884
+ if ((aes -> aes_set & AES_SET_WCS ) != 0 ) {
3885
+ sc = archive_string_conversion_to_charset (a , "UTF-8" , 1 );
3886
+ if (sc == NULL )
3887
+ return (-1 );/* Couldn't allocate memory for sc. */
3888
+ archive_string_empty (& (aes -> aes_utf8 ));
3889
+ r = archive_string_append_from_wcs_in_codepage (& (aes -> aes_utf8 ),
3890
+ aes -> aes_wcs .s , aes -> aes_wcs .length , sc );
3891
+ if (a == NULL )
3892
+ free_sconv_object (sc );
3893
+ if (r == 0 ) {
3894
+ aes -> aes_set |= AES_SET_UTF8 ;
3895
+ * p = aes -> aes_utf8 .s ;
3896
+ return (0 );/* success. */
3897
+ } else
3898
+ return (-1 );/* failure. */
3899
+ }
3900
+ #endif
3877
3901
/* Try converting WCS to MBS first if MBS does not exist yet. */
3878
3902
if ((aes -> aes_set & AES_SET_MBS ) == 0 ) {
3879
3903
const char * pm ; /* unused */
@@ -3958,6 +3982,32 @@ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
3958
3982
}
3959
3983
3960
3984
* wp = NULL ;
3985
+ #if defined(_WIN32 ) && !defined(__CYGWIN__ )
3986
+ /*
3987
+ * On Windows, prefer converting from UTF-8 directly to WCS because:
3988
+ * (1) there's no guarantee that the string can be represented in MBS (e.g.
3989
+ * with CP_ACP), and (2) in order to convert from UTF-8 to MBS, we're going
3990
+ * to need to convert from UTF-8 to WCS anyway and it's wasteful to throw
3991
+ * away that intermediate result
3992
+ */
3993
+ if (aes -> aes_set & AES_SET_UTF8 ) {
3994
+ struct archive_string_conv * sc ;
3995
+
3996
+ sc = archive_string_conversion_from_charset (a , "UTF-8" , 1 );
3997
+ if (sc != NULL ) {
3998
+ archive_wstring_empty ((& aes -> aes_wcs ));
3999
+ r = archive_wstring_append_from_mbs_in_codepage (& (aes -> aes_wcs ),
4000
+ aes -> aes_utf8 .s , aes -> aes_utf8 .length , sc );
4001
+ if (a == NULL )
4002
+ free_sconv_object (sc );
4003
+ if (r == 0 ) {
4004
+ aes -> aes_set |= AES_SET_WCS ;
4005
+ * wp = aes -> aes_wcs .s ;
4006
+ return (0 );
4007
+ }
4008
+ }
4009
+ }
4010
+ #endif
3961
4011
/* Try converting UTF8 to MBS first if MBS does not exist yet. */
3962
4012
if ((aes -> aes_set & AES_SET_MBS ) == 0 ) {
3963
4013
const char * p ; /* unused */
@@ -4211,21 +4261,31 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
4211
4261
4212
4262
aes -> aes_set = AES_SET_UTF8 ; /* Only UTF8 is set now. */
4213
4263
4214
- /* Try converting UTF-8 to MBS, return false on failure. */
4215
4264
sc = archive_string_conversion_from_charset (a , "UTF-8" , 1 );
4216
4265
if (sc == NULL )
4217
4266
return (-1 );/* Couldn't allocate memory for sc. */
4218
- r = archive_strcpy_l (& (aes -> aes_mbs ), utf8 , sc );
4219
4267
4220
4268
#if defined(_WIN32 ) && !defined(__CYGWIN__ )
4221
- /* On failure, make an effort to convert UTF8 to WCS as the active code page
4222
- * may not be able to represent all characters in the string */
4223
- if (r != 0 ) {
4224
- if (archive_wstring_append_from_mbs_in_codepage (& (aes -> aes_wcs ),
4225
- aes -> aes_utf8 .s , aes -> aes_utf8 .length , sc ) == 0 )
4226
- aes -> aes_set = AES_SET_UTF8 | AES_SET_WCS ;
4227
- }
4228
- #endif
4269
+ /* On Windows, there's no good way to convert from UTF8 -> MBS directly, so
4270
+ * prefer to first convert to WCS as (1) it's wasteful to throw away the
4271
+ * intermediate result, and (2) WCS will still be set even if we fail to
4272
+ * convert to MBS (e.g. with ACP that can't represent the characters) */
4273
+ r = archive_wstring_append_from_mbs_in_codepage (& (aes -> aes_wcs ),
4274
+ aes -> aes_utf8 .s , aes -> aes_utf8 .length , sc );
4275
+
4276
+ if (a == NULL )
4277
+ free_sconv_object (sc );
4278
+ if (r != 0 )
4279
+ return (-1 ); /* This will guarantee we can't convert to MBS */
4280
+ aes -> aes_set = AES_SET_UTF8 | AES_SET_WCS ; /* Both UTF8 and WCS set. */
4281
+
4282
+ /* Try converting WCS to MBS, return false on failure. */
4283
+ if (archive_string_append_from_wcs (& (aes -> aes_mbs ), aes -> aes_wcs .s ,
4284
+ aes -> aes_wcs .length ))
4285
+ return (-1 );
4286
+ #else
4287
+ /* Try converting UTF-8 to MBS, return false on failure. */
4288
+ r = archive_strcpy_l (& (aes -> aes_mbs ), utf8 , sc );
4229
4289
4230
4290
if (a == NULL )
4231
4291
free_sconv_object (sc );
@@ -4237,8 +4297,10 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
4237
4297
if (archive_wstring_append_from_mbs (& (aes -> aes_wcs ), aes -> aes_mbs .s ,
4238
4298
aes -> aes_mbs .length ))
4239
4299
return (-1 );
4240
- aes -> aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS ;
4300
+ #endif
4241
4301
4242
4302
/* All conversions succeeded. */
4303
+ aes -> aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS ;
4304
+
4243
4305
return (0 );
4244
4306
}
0 commit comments