javascript 和带有 utf-16 代理项对的字符串操作

发布于 2024-11-27 01:09:40 字数 1115 浏览 2 评论 0原文

我正在开发一个 Twitter 应用程序，刚刚偶然进入了 utf-8(16) 的世界。似乎大多数 javascript 字符串函数都像我一样对代理对视而不见。我必须重新编码一些东西以使其具有广泛的字符识别能力。

我有这个函数可以将字符串解析为数组，同时保留代理对。然后我将重新编码几个函数来处理数组而不是字符串。

/**
 * Splits a string into an array with one element per code point, keeping each
 * UTF-16 surrogate pair together as a single two-unit element.
 *
 * A high surrogate (U+D800-U+DBFF) is joined with the next unit only when that
 * unit is a low surrogate (U+DC00-U+DFFF). Unpaired surrogates are emitted on
 * their own, so malformed input never swallows a neighbouring character.
 *
 * @param {string} str - the string to split
 * @returns {string[]} the code points of str, in order
 */
function sortSurrogates(str){
  var cp = [];                 // array to hold code points
  var i = 0;                   // index of the code unit being examined
  var unit, next;
  while(i < str.length){       // walk by index rather than re-slicing the string
    unit = str.charCodeAt(i);
    next = str.charCodeAt(i + 1); // NaN past the end, so the range test fails
    if(unit >= 0xD800 && unit <= 0xDBFF && next >= 0xDC00 && next <= 0xDFFF){
      cp.push(str.substr(i, 2)); // well-formed pair: keep both units together
      i += 2;
    }else{                     // BMP code point or unpaired surrogate
      cp.push(str.charAt(i));
      i += 1;
    }
  }
  return cp;                   // return the array
}

我的问题是：有没有我没注意到的更简单的办法？我看到很多人反复强调 javascript 原生支持 utf-16，但我的测试让我觉得，utf-16 也许只是它的数据存储格式，而字符串函数并没有意识到这一点。我是不是漏掉了什么简单的东西？

编辑：为了帮助说明这个问题：

var a = "0123456789"; // U+0030 - U+0039 2 bytes each
var b = "

原文

I'm working on a twitter app and just stumbled into the world of utf-8(16). It seems the majority of javascript string functions are as blind to surrogate pairs as I was. I've got to recode some stuff to make it wide character aware.

I've got this function to parse strings into arrays while preserving the surrogate pairs. Then I'll recode several functions to deal with the arrays rather than strings.

/**
 * Splits a string into an array with one element per code point, keeping each
 * UTF-16 surrogate pair together as a single two-unit element.
 *
 * A high surrogate (U+D800-U+DBFF) is joined with the next unit only when that
 * unit is a low surrogate (U+DC00-U+DFFF). Unpaired surrogates are emitted on
 * their own, so malformed input never swallows a neighbouring character.
 *
 * @param {string} str - the string to split
 * @returns {string[]} the code points of str, in order
 */
function sortSurrogates(str){
  var cp = [];                 // array to hold code points
  var i = 0;                   // index of the code unit being examined
  var unit, next;
  while(i < str.length){       // walk by index rather than re-slicing the string
    unit = str.charCodeAt(i);
    next = str.charCodeAt(i + 1); // NaN past the end, so the range test fails
    if(unit >= 0xD800 && unit <= 0xDBFF && next >= 0xDC00 && next <= 0xDFFF){
      cp.push(str.substr(i, 2)); // well-formed pair: keep both units together
      i += 2;
    }else{                     // BMP code point or unpaired surrogate
      cp.push(str.charAt(i));
      i += 1;
    }
  }
  return cp;                   // return the array
}

My question is, is there something simpler I'm missing? I see so many people reiterating that javascript deals with utf-16 natively, yet my testing leads me to believe, that may be the data format, but the functions don't know it yet. Am I missing something simple?

EDIT:
To help illustrate the issue:

// Ten ASCII digits versus ten astral-plane digits: .length counts UTF-16 code
// units, so the second string reports 20 although it holds 10 characters.
var a = "0123456789"; // U+0030 - U+0039 2 bytes each
var b = "𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"; // U+1D7D8 - U+1D7E1 4 bytes each
alert(a.length); // javascript shows 10
alert(b.length); // javascript shows 20

Twitter sees and counts both of those as being 10 characters long.

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

生寂 2024-12-04 01:09:40

Javascript 内部使用 UCS-2，而不是 UTF-16。因此，在 Javascript 中处理 Unicode 非常困难，我不建议尝试这样做。

至于 Twitter 的做法，您似乎是在说它是按代码点进行理智计数，而不是按代码单元进行疯狂计数。

除非别无选择，否则应该使用实际支持 Unicode 的编程语言，并且该语言具有代码点接口，而不是代码单元接口。正如您所发现的，Javascript 还不够好。

它有 UCS-2 诅咒，这甚至比 UTF-16 诅咒还要糟糕，而后者本身就已经够糟糕了。我在 OSCON 的演讲中谈到了所有这些问题。

回复收藏 0 原文

别低头，皇冠会掉 2024-12-04 01:09:40

我已经拼凑出了 Unicode 字符串处理对象的起点。它创建一个名为 UnicodeString() 的函数，该函数接受 JavaScript 字符串或表示 Unicode 代码点的整数数组，并提供 length 和 codePoints 属性以及 toString() 和 slice() 方法。添加正则表达式支持会非常复杂，但是像 indexOf() 和 split() （没有正则表达式支持）这样的东西应该很容易实现。

/**
 * UnicodeString - a minimal code-point-aware string wrapper.
 *
 * Accepts either a JavaScript string or an array of integer code points and
 * exposes `codePoints` (array of integers), `length` (number of code points),
 * `slice(start, end)` and `toString()`. Callable with or without `new`.
 * A missing argument produces an empty UnicodeString.
 */
var UnicodeString = (function() {
    // Combines a high/low surrogate pair into the code point it encodes.
    function surrogatePairToCodePoint(charCode1, charCode2) {
        return ((charCode1 & 0x3FF) << 10) + (charCode2 & 0x3FF) + 0x10000;
    }

    function isHighSurrogate(charCode) {
        return charCode >= 0xD800 && charCode <= 0xDBFF;
    }

    function isLowSurrogate(charCode) {
        return charCode >= 0xDC00 && charCode <= 0xDFFF;
    }

    // Decodes a UTF-16 string into an array of code points. Only a high
    // surrogate immediately followed by a low surrogate is merged; unpaired
    // surrogates are kept as their own values so the string round-trips.
    function stringToCodePointArray(str) {
        var codePoints = [], i = 0, charCode, nextCharCode;
        while (i < str.length) {
            charCode = str.charCodeAt(i);
            nextCharCode = str.charCodeAt(i + 1); // NaN at the end: never a low surrogate
            if (isHighSurrogate(charCode) && isLowSurrogate(nextCharCode)) {
                codePoints.push(surrogatePairToCodePoint(charCode, nextCharCode));
                i += 2;
            } else {
                codePoints.push(charCode);
                ++i;
            }
        }
        return codePoints;
    }

    // Encodes an array of code points back into a UTF-16 string.
    function codePointArrayToString(codePoints) {
        var stringParts = [];
        for (var i = 0, len = codePoints.length, codePoint, offset, codePointCharCodes; i < len; ++i) {
            codePoint = codePoints[i];
            if (codePoint > 0xFFFF) {
                // Outside the BMP: split into a surrogate pair.
                offset = codePoint - 0x10000;
                codePointCharCodes = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)];
            } else {
                codePointCharCodes = [codePoint];
            }
            stringParts.push(String.fromCharCode.apply(String, codePointCharCodes));
        }
        return stringParts.join("");
    }

    function UnicodeString(arg) {
        // Allow calling without `new`.
        if (!(this instanceof UnicodeString)) {
            return new UnicodeString(arg);
        }
        if (arg === undefined || arg === null) {
            arg = [];
        }
        this.codePoints = (typeof arg === "string") ? stringToCodePointArray(arg) : arg;
        this.length = this.codePoints.length;
    }

    UnicodeString.prototype = {
        // Returns a new UnicodeString holding code points [start, end).
        slice: function(start, end) {
            return new UnicodeString(this.codePoints.slice(start, end));
        },

        // Returns the wrapped text as an ordinary JavaScript string.
        toString: function() {
            return codePointArrayToString(this.codePoints);
        }
    };

    return UnicodeString;
})();
var ustr = UnicodeString("f

I've knocked together the starting point for a Unicode string handling object. It creates a function called UnicodeString() that accepts either a JavaScript string or an array of integers representing Unicode code points and provides length and codePoints properties and toString() and slice() methods. Adding regular expression support would be very complicated, but things like indexOf() and split() (without regex support) should be pretty easy to implement.

/**
 * UnicodeString - a minimal code-point-aware string wrapper.
 *
 * Accepts either a JavaScript string or an array of integer code points and
 * exposes `codePoints` (array of integers), `length` (number of code points),
 * `slice(start, end)` and `toString()`. Callable with or without `new`.
 * A missing argument produces an empty UnicodeString.
 */
var UnicodeString = (function() {
    // Combines a high/low surrogate pair into the code point it encodes.
    function surrogatePairToCodePoint(charCode1, charCode2) {
        return ((charCode1 & 0x3FF) << 10) + (charCode2 & 0x3FF) + 0x10000;
    }

    function isHighSurrogate(charCode) {
        return charCode >= 0xD800 && charCode <= 0xDBFF;
    }

    function isLowSurrogate(charCode) {
        return charCode >= 0xDC00 && charCode <= 0xDFFF;
    }

    // Decodes a UTF-16 string into an array of code points. Only a high
    // surrogate immediately followed by a low surrogate is merged; unpaired
    // surrogates are kept as their own values so the string round-trips.
    function stringToCodePointArray(str) {
        var codePoints = [], i = 0, charCode, nextCharCode;
        while (i < str.length) {
            charCode = str.charCodeAt(i);
            nextCharCode = str.charCodeAt(i + 1); // NaN at the end: never a low surrogate
            if (isHighSurrogate(charCode) && isLowSurrogate(nextCharCode)) {
                codePoints.push(surrogatePairToCodePoint(charCode, nextCharCode));
                i += 2;
            } else {
                codePoints.push(charCode);
                ++i;
            }
        }
        return codePoints;
    }

    // Encodes an array of code points back into a UTF-16 string.
    function codePointArrayToString(codePoints) {
        var stringParts = [];
        for (var i = 0, len = codePoints.length, codePoint, offset, codePointCharCodes; i < len; ++i) {
            codePoint = codePoints[i];
            if (codePoint > 0xFFFF) {
                // Outside the BMP: split into a surrogate pair.
                offset = codePoint - 0x10000;
                codePointCharCodes = [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)];
            } else {
                codePointCharCodes = [codePoint];
            }
            stringParts.push(String.fromCharCode.apply(String, codePointCharCodes));
        }
        return stringParts.join("");
    }

    function UnicodeString(arg) {
        // Allow calling without `new`.
        if (!(this instanceof UnicodeString)) {
            return new UnicodeString(arg);
        }
        if (arg === undefined || arg === null) {
            arg = [];
        }
        this.codePoints = (typeof arg === "string") ? stringToCodePointArray(arg) : arg;
        this.length = this.codePoints.length;
    }

    UnicodeString.prototype = {
        // Returns a new UnicodeString holding code points [start, end).
        slice: function(start, end) {
            return new UnicodeString(this.codePoints.slice(start, end));
        },

        // Returns the wrapped text as an ordinary JavaScript string.
        toString: function() {
            return codePointArrayToString(this.codePoints);
        }
    };

    return UnicodeString;
})();

// Demo: wrap a string containing two astral characters and print its
// code-point length and a code-point-based slice.
// NOTE(review): the two astral characters were lost to mis-encoding in this
// copy (they appeared as "????????"); U+1D306 is used as a stand-in, confirm
// against the original post.
var ustr = UnicodeString("f𝌆𝌆bar");
// Expected output: String: 'f𝌆𝌆bar', length: 6, slice(2, 4): 𝌆b
document.getElementById("output").textContent = "String: '" + ustr + "', length: " + ustr.length + ", slice(2, 4): " + ustr.slice(2, 4);

<div id="output"></div>

回复收藏 0 原文

刘备忘录 2024-12-04 01:09:40

以下是一些在处理 JavaScript 中的代理项对时可能会有所帮助的脚本：

适用于 ES3+ 的 ES6 Unicode 垫片（shim）添加了 ECMAScript 6 中的 String.fromCodePoint 和 String.prototype.codePointAt 方法。ES3/5 的 fromCharCode 和 charCodeAt 方法不考虑代理对，因此会给出错误的结果。
XRegExp 通过 \u{10FFFF} 语法支持完整的 21 位 Unicode 代码点匹配，可以在 XRegExp 正则表达式中匹配任意单个代码点。

回复收藏 0 原文

岁吢 2024-12-04 01:09:40

Javascript 字符串迭代器可以为您提供实际字符而不是代理代码点：

>>> [..."0123456789"]

["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

>>> [..."

Javascript string iterators can give you the actual characters instead of the surrogate code points:

>>> [..."0123456789"]
["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
>>> [..."𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"]
["𝟘", "𝟙", "𝟚", "𝟛", "𝟜", "𝟝", "𝟞", "𝟟", "𝟠", "𝟡"]
>>> [..."0123456789"].length
10
>>> [..."𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡"].length
10

回复收藏 0 原文

终止放荡 2024-12-04 01:09:40

这符合我正在寻找的内容。它需要更好地支持不同的字符串函数。当我添加它时，我将更新这个答案。

/**
 * wString - a "wide" string wrapper that indexes by code point instead of by
 * UTF-16 code unit. Must be called with `new`.
 *
 * Instance members:
 *   cp        - array of strings, one element per code point
 *   length    - number of code points
 *   wString   - always true; lets callers test `item.wString`
 *   substr(start, len), substring(start, end), replace(target, str),
 *   toString() - code-point-aware counterparts of the String methods;
 *                each returns a plain string
 *   equals(s) - despite its name this ASSIGNS: it replaces the content of this
 *               object with s (a string or another wString)
 *
 * @param {string|wString} str - initial content; null/undefined gives ""
 */
function wString(str){
  var T = this; //makes 'this' visible in functions
  T.cp = [];    //code point array
  T.length = 0; //length attribute
  T.wString = true; // (item.wString) tests for wString object

//member functions
  // Declared with var so the helper stays private; the bare assignment used
  // before leaked a global and throws a ReferenceError in strict mode.
  var sortSurrogates = function(s){  //returns array of utf-16 code points
    var chrs = [];
    var i = 0, unit, next;
    while(i < s.length){         // walk by index rather than re-slicing
      unit = s.charCodeAt(i);
      next = s.charCodeAt(i + 1); // NaN past the end, so the range test fails
      if(unit >= 0xD800 && unit <= 0xDBFF && next >= 0xDC00 && next <= 0xDFFF){
        chrs.push(s.substr(i, 2)); // well-formed pair: keep both units together
        i += 2;
      }else{                     // BMP code point or unpaired surrogate
        chrs.push(s.charAt(i));
        i += 1;
      }
    }
    return chrs;
  };
//end member functions

//prototype functions
  T.substr = function(start,len){
    var from = Number(start) || 0;
    if(from < 0){                // negative start counts back from the end
      from = Math.max(T.cp.length + from, 0);
    }
    if(len === undefined){       // only a missing len means "to the end"; 0 means empty
      return T.cp.slice(from).join('');
    }
    return T.cp.slice(from, from + Math.max(Number(len) || 0, 0)).join('');
  };

  T.substring = function(start,end){
    // Like String.prototype.substring: clamp to [0, length] and swap if reversed.
    var n = T.cp.length;
    var a = Math.min(Math.max(Number(start) || 0, 0), n);
    var b = (end === undefined) ? n : Math.min(Math.max(Number(end) || 0, 0), n);
    return T.cp.slice(Math.min(a, b), Math.max(a, b)).join('');
  };

  T.replace = function(target,str){
    //allow wStrings as parameters
    if(str && str.wString) str = str.cp.join('');
    if(target && target.wString) target = target.cp.join('');
    return T.toString().replace(target,str);
  };

  T.equals = function(s){
    if(s === undefined || s === null){
      s = '';
    }
    if(!s.wString){
      T.cp = sortSurrogates(String(s));
    }else{
      T.cp = s.cp.slice();       // copy so the two objects do not share one array
    }
    T.length = T.cp.length;
  };

  T.toString = function(){return T.cp.join('');};
//end prototype functions

  T.equals(str);
}

测试结果：

// plain string

var x = "0123456789";

alert(x);                    // 0123456789

alert(x.substr(4,5))         // 45678

alert(x.substring(2,4))      // 23

alert(x.replace("456","x")); // 0123x789

alert(x.length);             // 10
// wString object

x = new wString("

This is along the lines of what I was looking for. It needs better support for the different string functions. As I add to it I will update this answer.

/**
 * wString - a "wide" string wrapper that indexes by code point instead of by
 * UTF-16 code unit. Must be called with `new`.
 *
 * Instance members:
 *   cp        - array of strings, one element per code point
 *   length    - number of code points
 *   wString   - always true; lets callers test `item.wString`
 *   substr(start, len), substring(start, end), replace(target, str),
 *   toString() - code-point-aware counterparts of the String methods;
 *                each returns a plain string
 *   equals(s) - despite its name this ASSIGNS: it replaces the content of this
 *               object with s (a string or another wString)
 *
 * @param {string|wString} str - initial content; null/undefined gives ""
 */
function wString(str){
  var T = this; //makes 'this' visible in functions
  T.cp = [];    //code point array
  T.length = 0; //length attribute
  T.wString = true; // (item.wString) tests for wString object

//member functions
  // Declared with var so the helper stays private; the bare assignment used
  // before leaked a global and throws a ReferenceError in strict mode.
  var sortSurrogates = function(s){  //returns array of utf-16 code points
    var chrs = [];
    var i = 0, unit, next;
    while(i < s.length){         // walk by index rather than re-slicing
      unit = s.charCodeAt(i);
      next = s.charCodeAt(i + 1); // NaN past the end, so the range test fails
      if(unit >= 0xD800 && unit <= 0xDBFF && next >= 0xDC00 && next <= 0xDFFF){
        chrs.push(s.substr(i, 2)); // well-formed pair: keep both units together
        i += 2;
      }else{                     // BMP code point or unpaired surrogate
        chrs.push(s.charAt(i));
        i += 1;
      }
    }
    return chrs;
  };
//end member functions

//prototype functions
  T.substr = function(start,len){
    var from = Number(start) || 0;
    if(from < 0){                // negative start counts back from the end
      from = Math.max(T.cp.length + from, 0);
    }
    if(len === undefined){       // only a missing len means "to the end"; 0 means empty
      return T.cp.slice(from).join('');
    }
    return T.cp.slice(from, from + Math.max(Number(len) || 0, 0)).join('');
  };

  T.substring = function(start,end){
    // Like String.prototype.substring: clamp to [0, length] and swap if reversed.
    var n = T.cp.length;
    var a = Math.min(Math.max(Number(start) || 0, 0), n);
    var b = (end === undefined) ? n : Math.min(Math.max(Number(end) || 0, 0), n);
    return T.cp.slice(Math.min(a, b), Math.max(a, b)).join('');
  };

  T.replace = function(target,str){
    //allow wStrings as parameters
    if(str && str.wString) str = str.cp.join('');
    if(target && target.wString) target = target.cp.join('');
    return T.toString().replace(target,str);
  };

  T.equals = function(s){
    if(s === undefined || s === null){
      s = '';
    }
    if(!s.wString){
      T.cp = sortSurrogates(String(s));
    }else{
      T.cp = s.cp.slice();       // copy so the two objects do not share one array
    }
    T.length = T.cp.length;
  };

  T.toString = function(){return T.cp.join('');};
//end prototype functions

  T.equals(str);
}

Test results:

// Side-by-side demo: the same calls on a plain ASCII string and on a wString
// holding ten astral-plane digits (U+1D7D8 - U+1D7E1). The trailing comments
// show what each alert displays.

// plain string
var x = "0123456789";
alert(x);                    // 0123456789
alert(x.substr(4,5))         // 45678
alert(x.substring(2,4))      // 23
alert(x.replace("456","x")); // 0123x789
alert(x.length);             // 10

// wString object
x = new wString("𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡");
alert(x);                    // 𝟘𝟙𝟚𝟛𝟜𝟝𝟞𝟟𝟠𝟡
alert(x.substr(4,5))         // 𝟜𝟝𝟞𝟟𝟠
alert(x.substring(2,4))      // 𝟚𝟛
alert(x.replace("𝟜𝟝𝟞","x")); // 𝟘𝟙𝟚𝟛x𝟟𝟠𝟡
alert(x.length);             // 10

回复收藏 0 原文

~没有更多了~