如何将包含 utf-16 编码文本的 std::string 转换为 utf-16 wstring?

发布于 2024-12-02 01:22:38 字数 6394 浏览 1 评论 0原文

因此,我们得到一个类似 Новая папка 的字符串,它是 utf-16 编码行的 utf-8 表示(utf-16 中的 Новая папка),我们想要将此字符串转换为 wstring 而不更改编码.. 意思是字面上将所有数据从 string 转换为 wstring 而不进行任何转换。所以我们会得到带有 Новая папка 内容的 wstring。这样的事该怎么办呢?

更新: 我的意思是 - 我们拥有字符串内正确的 utf-16 字符串的所有数据。我们需要的只是将该数据放入 wstring...这意味着如果 wstring 包含 wchar(可能恰好是 0000),我们就必须放入 2 个字符串字符 0000 一起得到它。那就是我不知道该怎么做。

更新2 我是如何到达这里的 - 我有义务在我的服务器上使用的 C++ 库是 C 风格的解析器。它以 std::string 的形式返回用户请求地址。当我让我的客户以这种格式向我发送请求时。

url_encode(UTF16toUTF8(wstring)) //pseudocode.

当然

string UTF16toUTF8(const wstring & in)
{
    string out;
    unsigned int codepoint;
    bool completecode = false;
    for (wstring::const_iterator p = in.begin();  p != in.end();  ++p)
    {
        if (*p >= 0xd800 && *p <= 0xdbff)
        {
            codepoint = ((*p - 0xd800) << 10) + 0x10000;
            completecode = false;
        }
        else if (!completecode && *p >= 0xdc00 && *p <= 0xdfff)
        {
            codepoint |= *p - 0xdc00;
            completecode = true;
        }
        else
        {
            codepoint = *p;
            completecode = true;
        }
        if (completecode)
        {
            if (codepoint <= 0x7f)
                out.push_back(codepoint);
            else if (codepoint <= 0x7ff)
            {
                out.push_back(0xc0 | ((codepoint >> 6) & 0x1f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else if (codepoint <= 0xffff)
            {
                out.push_back(0xe0 | ((codepoint >> 12) & 0x0f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else
            {
                out.push_back(0xf0 | ((codepoint >> 18) & 0x07));
                out.push_back(0x80 | ((codepoint >> 12) & 0x3f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
        }
    }
    return out;
}

std::string url_encode( std::string sSrc )
{
    const char SAFE[256] =
    {
        /*      0 1 2 3  4 5 6 7  8 9 A B  C D E F */
        /* 0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 1 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 2 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 3 */ 1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,

        /* 4 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 5 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
        /* 6 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 7 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,

        /* 8 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 9 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* A */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* B */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,

        /* C */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* D */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* E */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
    };
    const char DEC2HEX[16 + 1] = "0123456789ABCDEF";
    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    unsigned char * const pStart = new unsigned char[SRC_LEN * 3];
    unsigned char * pEnd = pStart;
    const unsigned char * const SRC_END = pSrc + SRC_LEN;

    for (; pSrc < SRC_END; ++pSrc)
    {
        if (SAFE[*pSrc]) 
            *pEnd++ = *pSrc;
        else
        {
            // escape this char
            *pEnd++ = '%';
            *pEnd++ = DEC2HEX[*pSrc >> 4];
            *pEnd++ = DEC2HEX[*pSrc & 0x0F];
        }
    }

    std::string sResult((char *)pStart, (char *)pEnd);
    delete [] pStart;
    return sResult;
}

std::string url_decode( std::string sSrc )
{
    // Note from RFC1630:  "Sequences which start with a percent sign
    // but are not followed by two hexadecimal characters (0-9, A-F) are reserved
    // for future extension"

    const char HEX2DEC[256] = 
    {
        /*       0  1  2  3   4  5  6  7   8  9  A  B   C  D  E  F */
        /* 0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 1 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 2 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 3 */  0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,

        /* 4 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 5 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 6 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 7 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* 8 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 9 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* A */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* B */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* C */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* D */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* E */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* F */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
    };

    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    const unsigned char * const SRC_END = pSrc + SRC_LEN;
    const unsigned char * const SRC_LAST_DEC = SRC_END - 2;   // last decodable '%' 

    char * const pStart = new char[SRC_LEN];
    char * pEnd = pStart;

    while (pSrc < SRC_LAST_DEC)
    {
        if (*pSrc == '%')
        {
            char dec1, dec2;
            if (-1 != (dec1 = HEX2DEC[*(pSrc + 1)])
                && -1 != (dec2 = HEX2DEC[*(pSrc + 2)]))
            {
                *pEnd++ = (dec1 << 4) + dec2;
                pSrc += 3;
                continue;
            }
        }

        *pEnd++ = *pSrc++;
    }

    // the last 2- chars
    while (pSrc < SRC_END)
        *pEnd++ = *pSrc++;

    std::string sResult(pStart, pEnd);
    delete [] pStart;
    return sResult;
}

,我调用 url_decode,但我得到一个字符串..( 所以我希望现在我的问题更清楚了。

So we get a string like Новая папка which is utf-8 representation of utf-16 encoded line (Новая папка in utf-16) we want to turn this string into wstring not changing encoding.. meaning literally bring all data from string to wstring with out any conversion. So we would get wstring with Новая папка contents. How to do such thing?

Update:
What I meant to say - we have all data for correct utf-16 string inside of string. All we need is to put that data into wstring... that means if wstring contains of wchar which could happen to be 0000 we would have to put 2 string chars 00 and 00 together to get it. That is what I do not know how to do.

Update2
How I got here - a C++ lib I am obligated to use on my server is C style parser. and it returns me user request adress as std::string. while I make my clients send to me requests in such format.

url_encode(UTF16toUTF8(wstring)) //pseudocode.

where

string UTF16toUTF8(const wstring & in)
{
    string out;
    unsigned int codepoint;
    bool completecode = false;
    for (wstring::const_iterator p = in.begin();  p != in.end();  ++p)
    {
        if (*p >= 0xd800 && *p <= 0xdbff)
        {
            codepoint = ((*p - 0xd800) << 10) + 0x10000;
            completecode = false;
        }
        else if (!completecode && *p >= 0xdc00 && *p <= 0xdfff)
        {
            codepoint |= *p - 0xdc00;
            completecode = true;
        }
        else
        {
            codepoint = *p;
            completecode = true;
        }
        if (completecode)
        {
            if (codepoint <= 0x7f)
                out.push_back(codepoint);
            else if (codepoint <= 0x7ff)
            {
                out.push_back(0xc0 | ((codepoint >> 6) & 0x1f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else if (codepoint <= 0xffff)
            {
                out.push_back(0xe0 | ((codepoint >> 12) & 0x0f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
            else
            {
                out.push_back(0xf0 | ((codepoint >> 18) & 0x07));
                out.push_back(0x80 | ((codepoint >> 12) & 0x3f));
                out.push_back(0x80 | ((codepoint >> 6) & 0x3f));
                out.push_back(0x80 | (codepoint & 0x3f));
            }
        }
    }
    return out;
}

std::string url_encode( std::string sSrc )
{
    const char SAFE[256] =
    {
        /*      0 1 2 3  4 5 6 7  8 9 A B  C D E F */
        /* 0 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 1 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 2 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 3 */ 1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,

        /* 4 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 5 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
        /* 6 */ 0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        /* 7 */ 1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,

        /* 8 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* 9 */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* A */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* B */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,

        /* C */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* D */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* E */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        /* F */ 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0
    };
    const char DEC2HEX[16 + 1] = "0123456789ABCDEF";
    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    unsigned char * const pStart = new unsigned char[SRC_LEN * 3];
    unsigned char * pEnd = pStart;
    const unsigned char * const SRC_END = pSrc + SRC_LEN;

    for (; pSrc < SRC_END; ++pSrc)
    {
        if (SAFE[*pSrc]) 
            *pEnd++ = *pSrc;
        else
        {
            // escape this char
            *pEnd++ = '%';
            *pEnd++ = DEC2HEX[*pSrc >> 4];
            *pEnd++ = DEC2HEX[*pSrc & 0x0F];
        }
    }

    std::string sResult((char *)pStart, (char *)pEnd);
    delete [] pStart;
    return sResult;
}

std::string url_decode( std::string sSrc )
{
    // Note from RFC1630:  "Sequences which start with a percent sign
    // but are not followed by two hexadecimal characters (0-9, A-F) are reserved
    // for future extension"

    const char HEX2DEC[256] = 
    {
        /*       0  1  2  3   4  5  6  7   8  9  A  B   C  D  E  F */
        /* 0 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 1 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 2 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 3 */  0, 1, 2, 3,  4, 5, 6, 7,  8, 9,-1,-1, -1,-1,-1,-1,

        /* 4 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 5 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 6 */ -1,10,11,12, 13,14,15,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 7 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* 8 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* 9 */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* A */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* B */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,

        /* C */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* D */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* E */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        /* F */ -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1
    };

    const unsigned char * pSrc = (const unsigned char *)sSrc.c_str();
    const int SRC_LEN = sSrc.length();
    const unsigned char * const SRC_END = pSrc + SRC_LEN;
    const unsigned char * const SRC_LAST_DEC = SRC_END - 2;   // last decodable '%' 

    char * const pStart = new char[SRC_LEN];
    char * pEnd = pStart;

    while (pSrc < SRC_LAST_DEC)
    {
        if (*pSrc == '%')
        {
            char dec1, dec2;
            if (-1 != (dec1 = HEX2DEC[*(pSrc + 1)])
                && -1 != (dec2 = HEX2DEC[*(pSrc + 2)]))
            {
                *pEnd++ = (dec1 << 4) + dec2;
                pSrc += 3;
                continue;
            }
        }

        *pEnd++ = *pSrc++;
    }

    // the last 2- chars
    while (pSrc < SRC_END)
        *pEnd++ = *pSrc++;

    std::string sResult(pStart, pEnd);
    delete [] pStart;
    return sResult;
}

Ofcourse I call url_decode, but I get a string..( so I hope now my problem is more clear.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2

梦里人 2024-12-09 01:22:38

以下是我正在修改的解决方案:

std::string wrong("Новая папка");
std::wstring correct( (wchar_t*)wrong.data() );

根据 http:// /www.cplusplus.com/reference/string/string/data/ data() 成员函数应该为我们提供原始 char* 并且简单地转换为 (wchar_t*) 应该会导致它粘贴 00 和 00一起做0000,正如您在示例中所描述的那样。

我个人不喜欢这样的选角,但这就是我迄今为止想到的全部。

编辑 -
您使用哪个库?它是否带有其他功能来逆转它所做的事情?

如果它很受欢迎,肯定其他人以前也遇到过这个问题。他们是怎么解决的呢?

编辑2-
这是一种令人厌恶的方式,使用 malloc,一些假设原始字符串中不会有任何半代码点,以及另一个可怕的强制转换。 :(

std::string wrong("Новая папка");
wchar_t *lesswrong = (wchar_t*) malloc (wrong.size()/sizeof(wchar_t) + sizeof(wchar_t));
lesswrong = (wchar_t*)wrong.data();
lesswrong[wrong.size()] = '\0';
std::wstring correct( lesswrong );

这不可能是正确的。即使它有效,它也很难看。

编辑 3 -
就像 Kerrick sadi 一样,这是一种更好的方法。

std::string wrong("Новая папка");
std::wstring correct( (wchar_t*)wrong.data(), wrong.size()/2 );

Here is what I am tinkering around with for a solution to your issue:

std::string wrong("Новая папка");
std::wstring correct( (wchar_t*)wrong.data() );

According to http://www.cplusplus.com/reference/string/string/data/ the data() member function should give us the raw char* and simply casting to a (wchar_t*) should cause it to stick the 00 and 00 together to make 0000, as you describe in you example.

I personally don't like casting like this, but this is all I have come up with so far.

Edit -
Which library are you using? Does it come with some other function to reverse what it has done?

If it is popular surely someone else has had this issue before. How did They solve it?

Edit 2 -
Here is a disgusting way, using malloc, some assumptions that there won't be any half code-points in the original string, and another terrible cast. :(

std::string wrong("Новая папка");
wchar_t *lesswrong = (wchar_t*) malloc (wrong.size()/sizeof(wchar_t) + sizeof(wchar_t));
lesswrong = (wchar_t*)wrong.data();
lesswrong[wrong.size()] = '\0';
std::wstring correct( lesswrong );

There is no way this can be correct. Even if it works it is so ugly.

Edit 3 -
Like Kerrick sadi, this is a better way to do it.

std::string wrong("Новая папка");
std::wstring correct( (wchar_t*)wrong.data(), wrong.size()/2 );
清风疏影 2024-12-09 01:22:38

如果我理解正确的话,您有一个包含 UTF-16 编码字符串的 std::string 对象,并且您希望将其转换为 std: :wstring 而不改变编码。如果我是正确的,那么您不必进行编码转换,也不必进行表示转换,而只需进行存储转换。

您还认为该字符串可能被错误地编码为 UTF-8。但是,UTF-8 是一种可变长度编码,但错误解释的数据的长度(Новая папка 为 22 个字符长)恰好是原始数据的长度(Новая папка 为 11 个字符长)。这就是为什么我怀疑这可能只是存储错误而不是编码错误的情况。

以下代码执行此操作:

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert((input.size() & 1) == 0);
    size_t len = input.size() / 2;
    std::wstring output;
    output.resize(len);

    for (size_t i = 0; i < len; ++i) {
        unsigned char chr1 = (unsigned char)input[2 * i];
        unsigned char chr2 = (unsigned char)input[2 * i + 1];

        // Note: this line suppose that you use `UTF-16-BE` both for
        // the std::string and the std::wstring. You'll have to swap
        // chr1 & chr2 if this is not the case.
        unsigned short val = (chr2 << 8)|(chr1);
        output[i] = (wchar_t)(val);
    }

    return output;
}

如果您知道在所有平台上,您的目标 sizeof(wchar_t) 等于 2(对于 64 位程序的 Mac OS,情况并非如此,例如,其中 sizeof (wchar_t) 等于 4),那么您可以使用简单的转换:

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert(sizeof(wchar_t) == 2); // A static assert would be better here
    assert((input.size() & 1) == 0);
    return input.empty()
        ? std::wstring()
        : std::wstring((wchar_t*)input[0], input.size() / 2);
}

If I understand you correctly, you have a std::string object that contains an UTF-16 encoded string, and you want to convert it to a std::wstring without changing the encoding. If I'm correct, then, you don't have to do conversion of encoding, nor of the representation but only of the storage.

You also think that the string may have incorrectly be encoded into UTF-8. However, UTF-8 is a variable length encoding, but the length of your incorrectly interpreted data (Новая папка is 22 characters long) is exactly twice the length of your original data (Новая папка is 11 characters long). This is why I suspect that this may be just a case of wrong storage and not wrong encoding.

The following code does that:

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert((input.size() & 1) == 0);
    size_t len = input.size() / 2;
    std::wstring output;
    output.resize(len);

    for (size_t i = 0; i < len; ++i) {
        unsigned char chr1 = (unsigned char)input[2 * i];
        unsigned char chr2 = (unsigned char)input[2 * i + 1];

        // Note: this line suppose that you use `UTF-16-BE` both for
        // the std::string and the std::wstring. You'll have to swap
        // chr1 & chr2 if this is not the case.
        unsigned short val = (chr2 << 8)|(chr1);
        output[i] = (wchar_t)(val);
    }

    return output;
}

If you know that on all the platform you target sizeof(wchar_t) equal 2 (this is not the case one Mac OS for 64-bit programs for exemple where sizeof(wchar_t) equals 4), then you can use a simple cast:

std::wstring convert_utf16_string_to_wstring(const std::string& input) {
    assert(sizeof(wchar_t) == 2); // A static assert would be better here
    assert((input.size() & 1) == 0);
    return input.empty()
        ? std::wstring()
        : std::wstring((wchar_t*)input[0], input.size() / 2);
}
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文