Skip to content

Commit b4a1991

Browse files
committed
Merge branch 'string-optimization' of https://github.com/dunhor/libarchive into 7zip-write-unicode
2 parents ffa43ae + 0c61a9a commit b4a1991

File tree

2 files changed

+164
-11
lines changed

2 files changed

+164
-11
lines changed

Diff for: libarchive/archive_string.c

+73-11
Original file line numberDiff line numberDiff line change
@@ -3874,6 +3874,30 @@ archive_mstring_get_utf8(struct archive *a, struct archive_mstring *aes,
38743874
}
38753875

38763876
*p = NULL;
3877+
#if defined(_WIN32) && !defined(__CYGWIN__)
3878+
/*
3879+
* On Windows, first try converting from WCS because (1) there's no
3880+
* guarantee that the conversion to MBS will succeed, e.g. when using
3881+
* CP_ACP, and (2) that's more efficient than converting to MBS, just to
3882+
* convert back to WCS again before finally converting to UTF-8
3883+
*/
3884+
if ((aes->aes_set & AES_SET_WCS) != 0) {
3885+
sc = archive_string_conversion_to_charset(a, "UTF-8", 1);
3886+
if (sc == NULL)
3887+
return (-1);/* Couldn't allocate memory for sc. */
3888+
archive_string_empty(&(aes->aes_utf8));
3889+
r = archive_string_append_from_wcs_in_codepage(&(aes->aes_utf8),
3890+
aes->aes_wcs.s, aes->aes_wcs.length, sc);
3891+
if (a == NULL)
3892+
free_sconv_object(sc);
3893+
if (r == 0) {
3894+
aes->aes_set |= AES_SET_UTF8;
3895+
*p = aes->aes_utf8.s;
3896+
return (0);/* success. */
3897+
} else
3898+
return (-1);/* failure. */
3899+
}
3900+
#endif
38773901
/* Try converting WCS to MBS first if MBS does not exist yet. */
38783902
if ((aes->aes_set & AES_SET_MBS) == 0) {
38793903
const char *pm; /* unused */
@@ -3958,6 +3982,32 @@ archive_mstring_get_wcs(struct archive *a, struct archive_mstring *aes,
39583982
}
39593983

39603984
*wp = NULL;
3985+
#if defined(_WIN32) && !defined(__CYGWIN__)
3986+
/*
3987+
* On Windows, prefer converting from UTF-8 directly to WCS because:
3988+
* (1) there's no guarantee that the string can be represented in MBS (e.g.
3989+
* with CP_ACP), and (2) in order to convert from UTF-8 to MBS, we're going
3990+
* to need to convert from UTF-8 to WCS anyway and it's wasteful to throw
3991+
* away that intermediate result
3992+
*/
3993+
if (aes->aes_set & AES_SET_UTF8) {
3994+
struct archive_string_conv *sc;
3995+
3996+
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
3997+
if (sc != NULL) {
3998+
archive_wstring_empty((&aes->aes_wcs));
3999+
r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
4000+
aes->aes_utf8.s, aes->aes_utf8.length, sc);
4001+
if (a == NULL)
4002+
free_sconv_object(sc);
4003+
if (r == 0) {
4004+
aes->aes_set |= AES_SET_WCS;
4005+
*wp = aes->aes_wcs.s;
4006+
return (0);
4007+
}
4008+
}
4009+
}
4010+
#endif
39614011
/* Try converting UTF8 to MBS first if MBS does not exist yet. */
39624012
if ((aes->aes_set & AES_SET_MBS) == 0) {
39634013
const char *p; /* unused */
@@ -4211,21 +4261,31 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
42114261

42124262
aes->aes_set = AES_SET_UTF8; /* Only UTF8 is set now. */
42134263

4214-
/* Try converting UTF-8 to MBS, return false on failure. */
42154264
sc = archive_string_conversion_from_charset(a, "UTF-8", 1);
42164265
if (sc == NULL)
42174266
return (-1);/* Couldn't allocate memory for sc. */
4218-
r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
42194267

42204268
#if defined(_WIN32) && !defined(__CYGWIN__)
4221-
/* On failure, make an effort to convert UTF8 to WCS as the active code page
4222-
* may not be able to represent all characters in the string */
4223-
if (r != 0) {
4224-
if (archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
4225-
aes->aes_utf8.s, aes->aes_utf8.length, sc) == 0)
4226-
aes->aes_set = AES_SET_UTF8 | AES_SET_WCS;
4227-
}
4228-
#endif
4269+
/* On Windows, there's no good way to convert from UTF8 -> MBS directly, so
4270+
* prefer to first convert to WCS as (1) it's wasteful to throw away the
4271+
* intermediate result, and (2) WCS will still be set even if we fail to
4272+
* convert to MBS (e.g. with ACP that can't represent the characters) */
4273+
r = archive_wstring_append_from_mbs_in_codepage(&(aes->aes_wcs),
4274+
aes->aes_utf8.s, aes->aes_utf8.length, sc);
4275+
4276+
if (a == NULL)
4277+
free_sconv_object(sc);
4278+
if (r != 0)
4279+
return (-1); /* This will guarantee we can't convert to MBS */
4280+
aes->aes_set = AES_SET_UTF8 | AES_SET_WCS; /* Both UTF8 and WCS set. */
4281+
4282+
/* Try converting WCS to MBS; return -1 on failure. */
4283+
if (archive_string_append_from_wcs(&(aes->aes_mbs), aes->aes_wcs.s,
4284+
aes->aes_wcs.length))
4285+
return (-1);
4286+
#else
4287+
/* Try converting UTF-8 to MBS, return false on failure. */
4288+
r = archive_strcpy_l(&(aes->aes_mbs), utf8, sc);
42294289

42304290
if (a == NULL)
42314291
free_sconv_object(sc);
@@ -4237,8 +4297,10 @@ archive_mstring_update_utf8(struct archive *a, struct archive_mstring *aes,
42374297
if (archive_wstring_append_from_mbs(&(aes->aes_wcs), aes->aes_mbs.s,
42384298
aes->aes_mbs.length))
42394299
return (-1);
4240-
aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
4300+
#endif
42414301

42424302
/* All conversions succeeded. */
4303+
aes->aes_set = AES_SET_UTF8 | AES_SET_WCS | AES_SET_MBS;
4304+
42434305
return (0);
42444306
}

Diff for: libarchive/test/test_archive_string_conversion.c

+91
Original file line numberDiff line numberDiff line change
@@ -882,3 +882,94 @@ DEFINE_TEST(test_archive_string_conversion)
882882
test_archive_string_canonicalization();
883883
test_archive_string_set_get();
884884
}
885+
886+
DEFINE_TEST(test_archive_string_update_utf8_win)
887+
{
888+
#if !defined(_WIN32) || defined(__CYGWIN__)
889+
skipping("This test is meant to verify unicode string handling on Windows"
890+
" with the C locale");
891+
#else
892+
static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
893+
static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
894+
struct archive_mstring mstr;
895+
int r;
896+
897+
memset(&mstr, 0, sizeof(mstr));
898+
899+
r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
900+
901+
/* On Windows, this should reliably fail with the C locale */
902+
assertEqualInt(-1, r);
903+
assertEqualInt(0, mstr.aes_set & AES_SET_MBS);
904+
905+
/* NOTE: We access the internals to validate that they were set by the
906+
* 'archive_mstring_update_utf8' function */
907+
/* UTF-8 should always be set */
908+
assertEqualInt(AES_SET_UTF8, mstr.aes_set & AES_SET_UTF8);
909+
assertEqualString(utf8_string, mstr.aes_utf8.s);
910+
/* WCS should always be set as well */
911+
assertEqualInt(AES_SET_WCS, mstr.aes_set & AES_SET_WCS);
912+
assertEqualWString(wcs_string, mstr.aes_wcs.s);
913+
914+
archive_mstring_clean(&mstr);
915+
#endif
916+
}
917+
918+
DEFINE_TEST(test_archive_string_update_utf8_utf8)
919+
{
920+
static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
921+
static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
922+
struct archive_mstring mstr;
923+
int r;
924+
925+
memset(&mstr, 0, sizeof(mstr));
926+
927+
if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
928+
skipping("UTF-8 not supported on this system.");
929+
return;
930+
}
931+
932+
r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
933+
934+
/* All conversions should have succeeded */
935+
assertEqualInt(0, r);
936+
assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
937+
assertEqualString(utf8_string, mstr.aes_utf8.s);
938+
assertEqualString(utf8_string, mstr.aes_mbs.s);
939+
assertEqualWString(wcs_string, mstr.aes_wcs.s);
940+
941+
archive_mstring_clean(&mstr);
942+
}
943+
944+
DEFINE_TEST(test_archive_string_update_utf8_koi8)
945+
{
946+
static const char utf8_string[] = "\xD0\xBF\xD1\x80\xD0\xB8";
947+
static const char koi8_string[] = "\xD0\xD2\xC9";
948+
static const wchar_t wcs_string[] = L"\U0000043f\U00000440\U00000438";
949+
struct archive_mstring mstr;
950+
int r;
951+
952+
memset(&mstr, 0, sizeof(mstr));
953+
954+
if (setlocale(LC_ALL, "ru_RU.KOI8-R") == NULL) {
955+
skipping("KOI8-R locale not available on this system.");
956+
return;
957+
}
958+
959+
r = archive_mstring_update_utf8(NULL, &mstr, utf8_string);
960+
961+
/* All conversions should have succeeded */
962+
assertEqualInt(0, r);
963+
assertEqualInt(AES_SET_MBS | AES_SET_WCS | AES_SET_UTF8, mstr.aes_set);
964+
assertEqualString(utf8_string, mstr.aes_utf8.s);
965+
assertEqualString(koi8_string, mstr.aes_mbs.s);
966+
#if defined(_WIN32) && !defined(__CYGWIN__)
967+
assertEqualWString(wcs_string, mstr.aes_wcs.s);
968+
#else
969+
/* No guarantee of how WCS strings behave; however, this test is
970+
* primarily meant for Windows */
971+
(void)wcs_string;
972+
#endif
973+
974+
archive_mstring_clean(&mstr);
975+
}

0 commit comments

Comments
 (0)