summaryrefslogtreecommitdiff
path: root/src/common/string_util.cpp
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/common/string_util.cpp158
1 files changed, 110 insertions, 48 deletions
diff --git a/src/common/string_util.cpp b/src/common/string_util.cpp
index 61f0939c4..54943d306 100644
--- a/src/common/string_util.cpp
+++ b/src/common/string_util.cpp
@@ -9,6 +9,7 @@
9 9
10#ifdef _WIN32 10#ifdef _WIN32
11 #include <Windows.h> 11 #include <Windows.h>
12 #include <codecvt>
12#else 13#else
13 #include <iconv.h> 14 #include <iconv.h>
14#endif 15#endif
@@ -411,7 +412,19 @@ std::string UriEncode(const std::string & sSrc)
411 412
412#ifdef _WIN32 413#ifdef _WIN32
413 414
// Converts a UTF-16 string to UTF-8 (Windows build).
//
// input:   UTF-16 code units to convert.
// returns: the UTF-8 encoding of input, or an empty string when input is not
//          valid UTF-16.
//
// The converter is constructed with explicit (empty) error strings. Without
// them, wstring_convert throws std::range_error on malformed input; with them
// it returns the error string instead, so this function never throws on bad
// data, in line with the iconv-based build of the same function.
std::string UTF16ToUTF8(const std::u16string& input)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert(std::string{},
                                                                               std::u16string{});
    return convert.to_bytes(input);
}
420
// Converts a UTF-8 string to UTF-16 (Windows build).
//
// input:   UTF-8 bytes to convert.
// returns: the UTF-16 encoding of input, or an empty string when input is not
//          valid UTF-8.
//
// The converter is constructed with explicit (empty) error strings. Without
// them, wstring_convert throws std::range_error on malformed input; with them
// it returns the error string instead, so this function never throws on bad
// data, in line with the iconv-based build of the same function.
std::u16string UTF8ToUTF16(const std::string& input)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert(std::string{},
                                                                               std::u16string{});
    return convert.from_bytes(input);
}
426
427static std::string UTF16ToUTF8(const std::wstring& input)
415{ 428{
416 auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr); 429 auto const size = WideCharToMultiByte(CP_UTF8, 0, input.data(), input.size(), nullptr, 0, nullptr, nullptr);
417 430
@@ -424,7 +437,7 @@ std::string UTF16ToUTF8(const std::wstring& input)
424 return output; 437 return output;
425} 438}
426 439
427std::wstring CPToUTF16(u32 code_page, const std::string& input) 440static std::wstring CPToUTF16(u32 code_page, const std::string& input)
428{ 441{
429 auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0); 442 auto const size = MultiByteToWideChar(code_page, 0, input.data(), input.size(), nullptr, 0);
430 443
@@ -437,7 +450,7 @@ std::wstring CPToUTF16(u32 code_page, const std::string& input)
437 return output; 450 return output;
438} 451}
439 452
440std::wstring UTF8ToUTF16(const std::string& input) 453std::wstring UTF8ToUTF16W(const std::string &input)
441{ 454{
442 return CPToUTF16(CP_UTF8, input); 455 return CPToUTF16(CP_UTF8, input);
443} 456}
@@ -455,61 +468,123 @@ std::string CP1252ToUTF8(const std::string& input)
455#else 468#else
456 469
457template <typename T> 470template <typename T>
458std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input) 471static std::string CodeToUTF8(const char* fromcode, const std::basic_string<T>& input)
459{ 472{
460 std::string result; 473 std::string result;
461 474
462 iconv_t const conv_desc = iconv_open("UTF-8", fromcode); 475 iconv_t const conv_desc = iconv_open("UTF-8", fromcode);
463 if ((iconv_t)-1 == conv_desc) 476 if ((iconv_t)(-1) == conv_desc)
464 { 477 {
465 ERROR_LOG(COMMON, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno)); 478 ERROR_LOG(COMMON, "Iconv initialization failure [%s]: %s", fromcode, strerror(errno));
479 iconv_close(conv_desc);
480 return {};
466 } 481 }
467 else
468 {
469 size_t const in_bytes = sizeof(T) * input.size();
470 size_t const out_buffer_size = 4 * in_bytes;
471 482
472 std::string out_buffer; 483 const size_t in_bytes = sizeof(T) * input.size();
473 out_buffer.resize(out_buffer_size); 484 // Multiply by 4, which is the max number of bytes to encode a codepoint
485 const size_t out_buffer_size = 4 * in_bytes;
474 486
475 auto src_buffer = &input[0]; 487 std::string out_buffer;
476 size_t src_bytes = in_bytes; 488 out_buffer.resize(out_buffer_size);
477 auto dst_buffer = &out_buffer[0];
478 size_t dst_bytes = out_buffer.size();
479 489
480 while (src_bytes != 0) 490 auto src_buffer = &input[0];
481 { 491 size_t src_bytes = in_bytes;
482 size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes, 492 auto dst_buffer = &out_buffer[0];
483 &dst_buffer, &dst_bytes); 493 size_t dst_bytes = out_buffer.size();
484 494
485 if ((size_t)-1 == iconv_result) 495 while (0 != src_bytes)
496 {
497 size_t const iconv_result = iconv(conv_desc, (char**)(&src_buffer), &src_bytes,
498 &dst_buffer, &dst_bytes);
499
500 if (static_cast<size_t>(-1) == iconv_result)
501 {
502 if (EILSEQ == errno || EINVAL == errno)
486 { 503 {
487 if (EILSEQ == errno || EINVAL == errno) 504 // Try to skip the bad character
505 if (0 != src_bytes)
488 { 506 {
489 // Try to skip the bad character 507 --src_bytes;
490 if (src_bytes != 0) 508 ++src_buffer;
491 {
492 --src_bytes;
493 ++src_buffer;
494 }
495 }
496 else
497 {
498 ERROR_LOG(COMMON, "iconv failure [%s]: %s", fromcode, strerror(errno));
499 break;
500 } 509 }
501 } 510 }
511 else
512 {
513 ERROR_LOG(COMMON, "iconv failure [%s]: %s", fromcode, strerror(errno));
514 break;
515 }
502 } 516 }
517 }
503 518
504 out_buffer.resize(out_buffer_size - dst_bytes); 519 out_buffer.resize(out_buffer_size - dst_bytes);
505 out_buffer.swap(result); 520 out_buffer.swap(result);
506 521
522 iconv_close(conv_desc);
523
524 return result;
525}
526
527std::u16string UTF8ToUTF16(const std::string& input)
528{
529 std::u16string result;
530
531 iconv_t const conv_desc = iconv_open("UTF-16", "UTF-8");
532 if ((iconv_t)(-1) == conv_desc)
533 {
534 ERROR_LOG(COMMON, "Iconv initialization failure [UTF-8]: %s", strerror(errno));
507 iconv_close(conv_desc); 535 iconv_close(conv_desc);
536 return {};
508 } 537 }
538
539 const size_t in_bytes = sizeof(char) * input.size();
540 // Multiply by 4, which is the max number of bytes to encode a codepoint
541 const size_t out_buffer_size = 4 * sizeof(char16_t) * in_bytes;
542
543 std::u16string out_buffer;
544 out_buffer.resize(out_buffer_size);
545
546 char* src_buffer = const_cast<char*>(&input[0]);
547 size_t src_bytes = in_bytes;
548 char* dst_buffer = (char*)(&out_buffer[0]);
549 size_t dst_bytes = out_buffer.size();
550
551 while (0 != src_bytes)
552 {
553 size_t const iconv_result = iconv(conv_desc, &src_buffer, &src_bytes,
554 &dst_buffer, &dst_bytes);
555
556 if (static_cast<size_t>(-1) == iconv_result)
557 {
558 if (EILSEQ == errno || EINVAL == errno)
559 {
560 // Try to skip the bad character
561 if (0 != src_bytes)
562 {
563 --src_bytes;
564 ++src_buffer;
565 }
566 }
567 else
568 {
569 ERROR_LOG(COMMON, "iconv failure [UTF-8]: %s", strerror(errno));
570 break;
571 }
572 }
573 }
574
575 out_buffer.resize(out_buffer_size - dst_bytes);
576 out_buffer.swap(result);
577
578 iconv_close(conv_desc);
509 579
510 return result; 580 return result;
511} 581}
512 582
583std::string UTF16ToUTF8(const std::u16string& input)
584{
585 return CodeToUTF8("UTF-16", input);
586}
587
513std::string CP1252ToUTF8(const std::string& input) 588std::string CP1252ToUTF8(const std::string& input)
514{ 589{
515 //return CodeToUTF8("CP1252//TRANSLIT", input); 590 //return CodeToUTF8("CP1252//TRANSLIT", input);
@@ -523,19 +598,6 @@ std::string SHIFTJISToUTF8(const std::string& input)
523 return CodeToUTF8("SJIS", input); 598 return CodeToUTF8("SJIS", input);
524} 599}
525 600
526std::string UTF16ToUTF8(const std::wstring& input)
527{
528 std::string result =
529 // CodeToUTF8("UCS-2", input);
530 // CodeToUTF8("UCS-2LE", input);
531 // CodeToUTF8("UTF-16", input);
532 CodeToUTF8("UTF-16LE", input);
533
534 // TODO: why is this needed?
535 result.erase(std::remove(result.begin(), result.end(), 0x00), result.end());
536 return result;
537}
538
539#endif 601#endif
540 602
541} 603}