diff options
Diffstat (limited to '')
| -rw-r--r-- | src/common/string_util.cpp | 158 |
1 files changed, 110 insertions, 48 deletions
diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp
index 61f0939c4..54943d306 100644
--- a/src/common/string_util.cpp
+++ b/src/common/string_util.cpp
| @@ -9,6 +9,7 @@ | |||
| 9 | 9 | ||
| 10 | #ifdef _WIN32 | 10 | #ifdef _WIN32 |
| 11 | #include <Windows.h> | 11 | #include <Windows.h> |
| 12 | #include <codecvt> | ||
| 12 | #else | 13 | #else |
| 13 | #include <iconv.h> | 14 | #include <iconv.h> |
| 14 | #endif | 15 | #endif |
| @@ -411,7 +412,19 @@ std::string UriEncode(const std::string & sSrc) | |||
| 411 | 412 | ||
| 412 | #ifdef _WIN32 | 413 | #ifdef _WIN32 |
| 413 | 414 | ||
| 414 | std::string UTF16ToUTF8(const std::wstring& input) | 415 | std::string UTF16ToUTF8(const std::u16string& input) |
| 416 | { | ||
| 417 | std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert; | ||
| 418 | return convert.to_bytes(input); | ||
| 419 | } | ||
| 420 | |||
| 421 | std::u16string UTF8ToUTF16(const std::string& input) | ||
| 422 | { | ||
| 423 | std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert; | ||
| 424 | return convert.from_bytes(input); | ||
| 425 | } | ||
| 426 | |||
| 427 | static std::string UTF16ToUTF8(const std::wstring& input) | ||
| 415 | { | 428 | { |
| 416 | auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr); | 429 | auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr); |
| 417 | 430 | ||
| @@ -424,7 +437,7 @@ std::string UTF16ToUTF8(const std::wstring& input) | |||
| 424 | return output; | 437 | return output; |
| 425 | } | 438 | } |
| 426 | 439 | ||
| 427 | std::wstring CPToUTF16(u32 code_page, const std::string& input) | 440 | static std::wstring CPToUTF16(u32 code_page, const std::string& input) |
| 428 | { | 441 | { |
| 429 | auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0); | 442 | auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0); |
| 430 | 443 | ||
| @@ -437,7 +450,7 @@ std::wstring CPToUTF16(u32 code_page, const std::string& input) | |||
| 437 | return output; | 450 | return output; |
| 438 | } | 451 | } |
| 439 | 452 | ||
| 440 | std::wstring UTF8ToUTF16(const std::string& input) | 453 | std::wstring UTF8ToUTF16W(const std::string &input) |
| 441 | { | 454 | { |
| 442 | return CPToUTF16(CP_UTF8, input); | 455 | return CPToUTF16(CP_UTF8, input); |
| 443 | } | 456 | } |
| @@ -455,61 +468,123 @@ std::string CP1252ToUTF8(const std::string& input) | |||
| 455 | #else | 468 | #else |
| 456 | 469 | ||
| 457 | template <typename T> | 470 | template <typename T> |
| 458 | std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input) | 471 | static std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input) |
| 459 | { | 472 | { |
| 460 | std::string result; | 473 | std::string result; |
| 461 | 474 | ||
| 462 | iconv_t const conv_desc = iconv_open("UTF-8", fromcode); | 475 | iconv_t const conv_desc = iconv_open("UTF-8", fromcode); |
| 463 | if ((iconv_t)-1 == conv_desc) | 476 | if ((iconv_t)(-1) == conv_desc) |
| 464 | { | 477 | { |
| 465 | ERROR_LOG(COMMON, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno)); | 478 | ERROR_LOG(COMMON, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno)); |
| 479 | iconv_close(conv_desc); | ||
| 480 | return {}; | ||
| 466 | } | 481 | } |
| 467 | else | ||
| 468 | { | ||
| 469 | size_t const in_bytes = sizeof(T) * input.size(); | ||
| 470 | size_t const out_buffer_size = 4 * in_bytes; | ||
| 471 | 482 | ||
| 472 | std::string out_buffer; | 483 | const size_t in_bytes = sizeof(T) * input.size(); |
| 473 | out_buffer.resize(out_buffer_size); | 484 | // Multiply by 4, which is the max number of bytes to encode a codepoint |
| 485 | const size_t out_buffer_size = 4 * in_bytes; | ||
| 474 | 486 | ||
| 475 | auto src_buffer = &input[0]; | 487 | std::string out_buffer; |
| 476 | size_t src_bytes = in_bytes; | 488 | out_buffer.resize(out_buffer_size); |
| 477 | auto dst_buffer = &out_buffer[0]; | ||
| 478 | size_t dst_bytes = out_buffer.size(); | ||
| 479 | 489 | ||
| 480 | while (src_bytes != 0) | 490 | auto src_buffer = &input[0]; |
| 481 | { | 491 | size_t src_bytes = in_bytes; |
| 482 | size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes, | 492 | auto dst_buffer = &out_buffer[0]; |
| 483 | &dst_buffer, &dst_bytes); | 493 | size_t dst_bytes = out_buffer.size(); |
| 484 | 494 | ||
| 485 | if ((size_t)-1 == iconv_result) | 495 | while (0 != src_bytes) |
| 496 | { | ||
| 497 | size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes, | ||
| 498 | &dst_buffer, &dst_bytes); | ||
| 499 | |||
| 500 | if (static_cast<size_t>(-1) == iconv_result) | ||
| 501 | { | ||
| 502 | if (EILSEQ == errno || EINVAL == errno) | ||
| 486 | { | 503 | { |
| 487 | if (EILSEQ == errno || EINVAL == errno) | 504 | // Try to skip the bad character |
| 505 | if (0 != src_bytes) | ||
| 488 | { | 506 | { |
| 489 | // Try to skip the bad character | 507 | --src_bytes; |
| 490 | if (src_bytes != 0) | 508 | ++src_buffer; |
| 491 | { | ||
| 492 | --src_bytes; | ||
| 493 | ++src_buffer; | ||
| 494 | } | ||
| 495 | } | ||
| 496 | else | ||
| 497 | { | ||
| 498 | ERROR_LOG(COMMON, "iconv failure [%s]: %s", fromcode, strerror(errno)); | ||
| 499 | break; | ||
| 500 | } | 509 | } |
| 501 | } | 510 | } |
| 511 | else | ||
| 512 | { | ||
| 513 | ERROR_LOG(COMMON, "iconv failure [%s]: %s", fromcode, strerror(errno)); | ||
| 514 | break; | ||
| 515 | } | ||
| 502 | } | 516 | } |
| 517 | } | ||
| 503 | 518 | ||
| 504 | out_buffer.resize(out_buffer_size - dst_bytes); | 519 | out_buffer.resize(out_buffer_size - dst_bytes); |
| 505 | out_buffer.swap(result); | 520 | out_buffer.swap(result); |
| 506 | 521 | ||
| 522 | iconv_close(conv_desc); | ||
| 523 | |||
| 524 | return result; | ||
| 525 | } | ||
| 526 | |||
| 527 | std::u16string UTF8ToUTF16(const std::string& input) | ||
| 528 | { | ||
| 529 | std::u16string result; | ||
| 530 | |||
| 531 | iconv_t const conv_desc = iconv_open("UTF-16", "UTF-8"); | ||
| 532 | if ((iconv_t)(-1) == conv_desc) | ||
| 533 | { | ||
| 534 | ERROR_LOG(COMMON, "Iconv initialization failure [UTF-8]: %s", strerror(errno)); | ||
| 507 | iconv_close(conv_desc); | 535 | iconv_close(conv_desc); |
| 536 | return {}; | ||
| 508 | } | 537 | } |
| 538 | |||
| 539 | const size_t in_bytes = sizeof(char) * input.size(); | ||
| 540 | // Multiply by 4, which is the max number of bytes to encode a codepoint | ||
| 541 | const size_t out_buffer_size = 4 * sizeof(char16_t) * in_bytes; | ||
| 542 | |||
| 543 | std::u16string out_buffer; | ||
| 544 | out_buffer.resize(out_buffer_size); | ||
| 545 | |||
| 546 | char* src_buffer = const_cast<char*>(&input[0]); | ||
| 547 | size_t src_bytes = in_bytes; | ||
| 548 | char* dst_buffer = (char*)(&out_buffer[0]); | ||
| 549 | size_t dst_bytes = out_buffer.size(); | ||
| 550 | |||
| 551 | while (0 != src_bytes) | ||
| 552 | { | ||
| 553 | size_t const iconv_result = iconv(conv_desc, &src_buffer, &src_bytes, | ||
| 554 | &dst_buffer, &dst_bytes); | ||
| 555 | |||
| 556 | if (static_cast<size_t>(-1) == iconv_result) | ||
| 557 | { | ||
| 558 | if (EILSEQ == errno || EINVAL == errno) | ||
| 559 | { | ||
| 560 | // Try to skip the bad character | ||
| 561 | if (0 != src_bytes) | ||
| 562 | { | ||
| 563 | --src_bytes; | ||
| 564 | ++src_buffer; | ||
| 565 | } | ||
| 566 | } | ||
| 567 | else | ||
| 568 | { | ||
| 569 | ERROR_LOG(COMMON, "iconv failure [UTF-8]: %s", strerror(errno)); | ||
| 570 | break; | ||
| 571 | } | ||
| 572 | } | ||
| 573 | } | ||
| 574 | |||
| 575 | out_buffer.resize(out_buffer_size - dst_bytes); | ||
| 576 | out_buffer.swap(result); | ||
| 577 | |||
| 578 | iconv_close(conv_desc); | ||
| 509 | 579 | ||
| 510 | return result; | 580 | return result; |
| 511 | } | 581 | } |
| 512 | 582 | ||
| 583 | std::string UTF16ToUTF8(const std::u16string& input) | ||
| 584 | { | ||
| 585 | return CodeToUTF8("UTF-16", input); | ||
| 586 | } | ||
| 587 | |||
| 513 | std::string CP1252ToUTF8(const std::string& input) | 588 | std::string CP1252ToUTF8(const std::string& input) |
| 514 | { | 589 | { |
| 515 | //return CodeToUTF8("CP1252//TRANSLIT", input); | 590 | //return CodeToUTF8("CP1252//TRANSLIT", input); |
| @@ -523,19 +598,6 @@ std::string SHIFTJISToUTF8(const std::string& input) | |||
| 523 | return CodeToUTF8("SJIS", input); | 598 | return CodeToUTF8("SJIS", input); |
| 524 | } | 599 | } |
| 525 | 600 | ||
| 526 | std::string UTF16ToUTF8(const std::wstring& input) | ||
| 527 | { | ||
| 528 | std::string result = | ||
| 529 | // CodeToUTF8("UCS-2", input); | ||
| 530 | // CodeToUTF8("UCS-2LE", input); | ||
| 531 | // CodeToUTF8("UTF-16", input); | ||
| 532 | CodeToUTF8("UTF-16LE", input); | ||
| 533 | |||
| 534 | // TODO: why is this needed? | ||
| 535 | result.erase(std::remove(result.begin(), result.end(), 0x00), result.end()); | ||
| 536 | return result; | ||
| 537 | } | ||
| 538 | |||
| 539 | #endif | 601 | #endif |
| 540 | 602 | ||
| 541 | } | 603 | } |