从字符串中提取可单击的单词，并包括标点符号

发布于 2025-01-25 03:22:59 字数 7688 浏览 2 评论 0原文

我有一个句子，还有一个由该句子中可点击单词组成的数组。该数组不包含标点符号。

这是句子：

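Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!”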

这是可点击单词的结构。它是一个数组，包含每个单词在句子中开始和结束位置的索引。此数组不包含句子中的标点符号。

标点符号不可点击。

"tokens": [
            {
              "position": [
                0,
                4
              ],
              "value": "into"
            },
            {
              "position": [
                5,
                8
              ],
              "value": "the"
            },
            {
              "position": [
                9,
                14
              ],
              "value": "trunk"
            },
            {
              "position": [
                15,
                17
              ],
              "value": "we"
            },
            {
              "position": [
                18,
                21
              ],
              "value": "put"
            },
            {
              "position": [
                22,
                25
              ],
              "value": "two"
            },
            {
              "position": [
                26,
                31
              ],
              "value": "poles"
            },
            {
              "position": [
                32,
                35
              ],
              "value": "and"
            },
            {
              "position": [
                36,
                39
              ],
              "value": "the"
            },
            {
              "position": [
                40,
                43
              ],
              "value": "can"
            },
            {
              "position": [
                44,
                46
              ],
              "value": "of"
            },
            {
              "position": [
                47,
                52
              ],
              "value": "worms"
            },
            {
              "position": [
                53,
                56
              ],
              "value": "and"
            },
            {
              "position": [
                57,
                58
              ],
              "value": "a"
            },
            {
              "position": [
                59,
                63
              ],
              "value": "sack"
            },
            {
              "position": [
                64,
                66
              ],
              "value": "of"
            },
            {
              "position": [
                67,
                77
              ],
              "value": "sandwiches"
            },
            {
              "position": [
                78,
                81
              ],
              "value": "and"
            },
            {
              "position": [
                82,
                83
              ],
              "value": "a"
            },
            {
              "position": [
                84,
                91
              ],
              "value": "thermos"
            },
            {
              "position": [
                92,
                94
              ],
              "value": "of"
            },
            {
              "position": [
                95,
                100
              ],
              "value": "water"
            },
            {
              "position": [
                103,
                108
              ],
              "value": "we're"
            },
            {
              "position": [
                109,
                114
              ],
              "value": "going"
            },
            {
              "position": [
                115,
                117
              ],
              "value": "on"
            },
            {
              "position": [
                118,
                119
              ],
              "value": "a"
            },
            {
              "position": [
                120,
                127
              ],
              "value": "journey"
            },
            {
              "position": [
                130,
                132
              ],
              "value": "my"
            },
            {
              "position": [
                133,
                139
              ],
              "value": "father"
            },
            {
              "position": [
                140,
                144
              ],
              "value": "said"
            },
            {
              "position": [
                147,
                149
              ],
              "value": "to"
            },
            {
              "position": [
                150,
                151
              ],
              "value": "a"
            },
            {
              "position": [
                152,
                158
              ],
              "value": "secret"
            },
            {
              "position": [
                159,
                164
              ],
              "value": "place"
            },
            {
              "position": [
                166,
                171
              ],
              "value": "we'll"
            },
            {
              "position": [
                172,
                177
              ],
              "value": "catch"
            },
            {
              "position": [
                178,
                181
              ],
              "value": "the"
            },
            {
              "position": [
                182,
                185
              ],
              "value": "air"
            },
            {
              "position": [
                187,
                192
              ],
              "value": "we'll"
            },
            {
              "position": [
                193,
                198
              ],
              "value": "catch"
            },
            {
              "position": [
                199,
                202
              ],
              "value": "the"
            },
            {
              "position": [
                203,
                209
              ],
              "value": "breeze"
            }
          ]
        },

这是我用来获取可点击单词的代码：

 // Builds the list of strings to render: one entry per token, made of the token's text plus a slice of the sentence that follows it.
 const getWordsFromTokens = tokens.reduce((words, token)=>{
   let start = token.position[0]; // start: index in the sentence of the token's first character
   let end = token.position[1]; // end: upper bound handed to substring(), which stops just before this index

   let differenceBetweenLastPositionAndFirst = end+(end-start); 
   
    /* Slice of the sentence that follows the token, meant to pick up spaces and punctuation.
        NOTE(review): the slice runs from `end` to `end + (end - start)`, so its length equals the
        token's own length rather than the distance to the next token's start - confirm intent.
    */
   let punctuationMarks = content.substring(end, (differenceBetweenLastPositionAndFirst)); 
   
   console.log(punctuationMarks);

   words.push( content.substring(start, end)+punctuationMarks); // the word followed by the trailing slice computed above
   return words; // hand the accumulator to the next reduce iteration
  },[]);

这是我在这里呈现文本的方式

return (
    <div>
      <p> {
        getWordsFromTokens.map((word, index)=>{
         return <a href={'/word/' + word} > {word}</a>
        })
      }
      </p>
    </div>
  )

我的问题是：当我渲染文本时，它看起来与原始文本并不完全一样。我可能做错了什么？

这是最终结果看起来像

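Into the the tr trunk we p we p put tw two po poles and and th the ca can of of w worms and and a a sack of of s sandwiches and a the and a a thermos of wat of w water. “We We’re goin going on a on a a journey,” my f my f father said. said. “T To a a secret place place. We’ We’ll catc catch the the ai air! W We’ll catc catch the the br breeze!”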

原文

I have a sentence and also an array of clickable words from the sentence. The array does not include the punctuation marks.

Here is the sentence:

Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!”

Here is the structure of the clickable words. It's an array containing the indexes of where within the sentence the word begins and ends. This Array does not contain the punctuation marks in the sentence

The punctuation marks are not clickable.

"tokens": [
            {
              "position": [
                0,
                4
              ],
              "value": "into"
            },
            {
              "position": [
                5,
                8
              ],
              "value": "the"
            },
            {
              "position": [
                9,
                14
              ],
              "value": "trunk"
            },
            {
              "position": [
                15,
                17
              ],
              "value": "we"
            },
            {
              "position": [
                18,
                21
              ],
              "value": "put"
            },
            {
              "position": [
                22,
                25
              ],
              "value": "two"
            },
            {
              "position": [
                26,
                31
              ],
              "value": "poles"
            },
            {
              "position": [
                32,
                35
              ],
              "value": "and"
            },
            {
              "position": [
                36,
                39
              ],
              "value": "the"
            },
            {
              "position": [
                40,
                43
              ],
              "value": "can"
            },
            {
              "position": [
                44,
                46
              ],
              "value": "of"
            },
            {
              "position": [
                47,
                52
              ],
              "value": "worms"
            },
            {
              "position": [
                53,
                56
              ],
              "value": "and"
            },
            {
              "position": [
                57,
                58
              ],
              "value": "a"
            },
            {
              "position": [
                59,
                63
              ],
              "value": "sack"
            },
            {
              "position": [
                64,
                66
              ],
              "value": "of"
            },
            {
              "position": [
                67,
                77
              ],
              "value": "sandwiches"
            },
            {
              "position": [
                78,
                81
              ],
              "value": "and"
            },
            {
              "position": [
                82,
                83
              ],
              "value": "a"
            },
            {
              "position": [
                84,
                91
              ],
              "value": "thermos"
            },
            {
              "position": [
                92,
                94
              ],
              "value": "of"
            },
            {
              "position": [
                95,
                100
              ],
              "value": "water"
            },
            {
              "position": [
                103,
                108
              ],
              "value": "we're"
            },
            {
              "position": [
                109,
                114
              ],
              "value": "going"
            },
            {
              "position": [
                115,
                117
              ],
              "value": "on"
            },
            {
              "position": [
                118,
                119
              ],
              "value": "a"
            },
            {
              "position": [
                120,
                127
              ],
              "value": "journey"
            },
            {
              "position": [
                130,
                132
              ],
              "value": "my"
            },
            {
              "position": [
                133,
                139
              ],
              "value": "father"
            },
            {
              "position": [
                140,
                144
              ],
              "value": "said"
            },
            {
              "position": [
                147,
                149
              ],
              "value": "to"
            },
            {
              "position": [
                150,
                151
              ],
              "value": "a"
            },
            {
              "position": [
                152,
                158
              ],
              "value": "secret"
            },
            {
              "position": [
                159,
                164
              ],
              "value": "place"
            },
            {
              "position": [
                166,
                171
              ],
              "value": "we'll"
            },
            {
              "position": [
                172,
                177
              ],
              "value": "catch"
            },
            {
              "position": [
                178,
                181
              ],
              "value": "the"
            },
            {
              "position": [
                182,
                185
              ],
              "value": "air"
            },
            {
              "position": [
                187,
                192
              ],
              "value": "we'll"
            },
            {
              "position": [
                193,
                198
              ],
              "value": "catch"
            },
            {
              "position": [
                199,
                202
              ],
              "value": "the"
            },
            {
              "position": [
                203,
                209
              ],
              "value": "breeze"
            }
          ]
        },

Here is my code that gets the clickable words

 // Builds the list of strings to render: one entry per token, made of the token's text plus a slice of the sentence that follows it.
 const getWordsFromTokens = tokens.reduce((words, token)=>{
   let start = token.position[0]; // start: index in the sentence of the token's first character
   let end = token.position[1]; // end: upper bound handed to substring(), which stops just before this index

   let differenceBetweenLastPositionAndFirst = end+(end-start); 
   
    /* Slice of the sentence that follows the token, meant to pick up spaces and punctuation.
        NOTE(review): the slice runs from `end` to `end + (end - start)`, so its length equals the
        token's own length rather than the distance to the next token's start - confirm intent.
    */
   let punctuationMarks = content.substring(end, (differenceBetweenLastPositionAndFirst)); 
   
   console.log(punctuationMarks);

   words.push( content.substring(start, end)+punctuationMarks); // the word followed by the trailing slice computed above
   return words; // hand the accumulator to the next reduce iteration
  },[]);

Here is How I'm rendering the text

return (
    <div>
      <p> {
        getWordsFromTokens.map((word, index)=>{
         return <a href={'/word/' + word} > {word}</a>
        })
      }
      </p>
    </div>
  )

Here is my problem, When I render the text, it does not look exactly like the original text. What is it that I could be doing wrong?

Here is how the final Result looks like

Into the the tr trunk we p we p put tw two po poles and and th the ca can of of w worms and and a a sack of of s sandwiches and a the and a a thermos of wat of w water. “We We’re goin going on a on a a journey,” my f my f father said. said. “T To a a secret place place. We’ We’ll catc catch the the ai air! W We’ll catc catch the the br breeze!”

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

一场春暖 2025-02-01 03:22:59

这样的解决方案呢？我使用光标来跟踪句子内部的位置。

const tokens = [{
    "position": [
      0,
      4
    ],
    "value": "into"
  },
  {
    "position": [
      5,
      8
    ],
    "value": "the"
  },
  {
    "position": [
      9,
      14
    ],
    "value": "trunk"
  },
  {
    "position": [
      15,
      17
    ],
    "value": "we"
  },
  {
    "position": [
      18,
      21
    ],
    "value": "put"
  },
  {
    "position": [
      22,
      25
    ],
    "value": "two"
  },
  {
    "position": [
      26,
      31
    ],
    "value": "poles"
  },
  {
    "position": [
      32,
      35
    ],
    "value": "and"
  },
  {
    "position": [
      36,
      39
    ],
    "value": "the"
  },
  {
    "position": [
      40,
      43
    ],
    "value": "can"
  },
  {
    "position": [
      44,
      46
    ],
    "value": "of"
  },
  {
    "position": [
      47,
      52
    ],
    "value": "worms"
  },
  {
    "position": [
      53,
      56
    ],
    "value": "and"
  },
  {
    "position": [
      57,
      58
    ],
    "value": "a"
  },
  {
    "position": [
      59,
      63
    ],
    "value": "sack"
  },
  {
    "position": [
      64,
      66
    ],
    "value": "of"
  },
  {
    "position": [
      67,
      77
    ],
    "value": "sandwiches"
  },
  {
    "position": [
      78,
      81
    ],
    "value": "and"
  },
  {
    "position": [
      82,
      83
    ],
    "value": "a"
  },
  {
    "position": [
      84,
      91
    ],
    "value": "thermos"
  },
  {
    "position": [
      92,
      94
    ],
    "value": "of"
  },
  {
    "position": [
      95,
      100
    ],
    "value": "water"
  },
  {
    "position": [
      103,
      108
    ],
    "value": "we're"
  },
  {
    "position": [
      109,
      114
    ],
    "value": "going"
  },
  {
    "position": [
      115,
      117
    ],
    "value": "on"
  },
  {
    "position": [
      118,
      119
    ],
    "value": "a"
  },
  {
    "position": [
      120,
      127
    ],
    "value": "journey"
  },
  {
    "position": [
      130,
      132
    ],
    "value": "my"
  },
  {
    "position": [
      133,
      139
    ],
    "value": "father"
  },
  {
    "position": [
      140,
      144
    ],
    "value": "said"
  },
  {
    "position": [
      147,
      149
    ],
    "value": "to"
  },
  {
    "position": [
      150,
      151
    ],
    "value": "a"
  },
  {
    "position": [
      152,
      158
    ],
    "value": "secret"
  },
  {
    "position": [
      159,
      164
    ],
    "value": "place"
  },
  {
    "position": [
      166,
      171
    ],
    "value": "we'll"
  },
  {
    "position": [
      172,
      177
    ],
    "value": "catch"
  },
  {
    "position": [
      178,
      181
    ],
    "value": "the"
  },
  {
    "position": [
      182,
      185
    ],
    "value": "air"
  },
  {
    "position": [
      187,
      192
    ],
    "value": "we'll"
  },
  {
    "position": [
      193,
      198
    ],
    "value": "catch"
  },
  {
    "position": [
      199,
      202
    ],
    "value": "the"
  },
  {
    "position": [
      203,
      209
    ],
    "value": "breeze"
  }
];

const content = 'Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!"';

let cursorPosition = 0; // index in `content` just past the last token consumed so far

/**
 * Flat list of segments covering `content` from its start to the end of the
 * last token. Each token contributes two entries, in order:
 *   { type: 'non-word', value } - the text between the previous token and this one
 *   { type: 'word', value }     - the token's own text
 * Values are sliced out of `content` (not taken from `token.value`) so the
 * sentence's original casing and characters are what gets rendered.
 */
const getWordsFromTokens = tokens.reduce((segments, { position: [tokenStart, tokenEnd] }) => {
  segments.push(
    { type: 'non-word', value: content.substring(cursorPosition, tokenStart) },
    { type: 'word', value: content.substring(tokenStart, tokenEnd) }
  );

  cursorPosition = tokenEnd; // the next gap starts where this token stops

  return segments;
}, []);

// Whatever follows the last token (closing punctuation, if any).
const endOfSentence = content.substring(cursorPosition);

// Build real DOM nodes instead of concatenating onto innerHTML: text is never
// parsed as markup, so characters such as <, & or ' in the sentence cannot
// break or inject HTML, and the target element is touched only once.
const sentenceFragment = document.createDocumentFragment();

for (const { type, value } of getWordsFromTokens) {
  if (type === 'word') {
    const link = document.createElement('a');
    // Encode the word so it stays a single, valid path segment.
    link.setAttribute('href', `/word/${encodeURIComponent(value)}`);
    link.textContent = value;
    sentenceFragment.append(link);
  } else {
    sentenceFragment.append(value); // strings are appended as text nodes
  }
}

sentenceFragment.append(endOfSentence);

document.getElementById('new-sentence').append(sentenceFragment);

<p id='new-sentence'></p>

What about a solution like this? I use a cursor to track the position inside the sentence.

const tokens = [{
    "position": [
      0,
      4
    ],
    "value": "into"
  },
  {
    "position": [
      5,
      8
    ],
    "value": "the"
  },
  {
    "position": [
      9,
      14
    ],
    "value": "trunk"
  },
  {
    "position": [
      15,
      17
    ],
    "value": "we"
  },
  {
    "position": [
      18,
      21
    ],
    "value": "put"
  },
  {
    "position": [
      22,
      25
    ],
    "value": "two"
  },
  {
    "position": [
      26,
      31
    ],
    "value": "poles"
  },
  {
    "position": [
      32,
      35
    ],
    "value": "and"
  },
  {
    "position": [
      36,
      39
    ],
    "value": "the"
  },
  {
    "position": [
      40,
      43
    ],
    "value": "can"
  },
  {
    "position": [
      44,
      46
    ],
    "value": "of"
  },
  {
    "position": [
      47,
      52
    ],
    "value": "worms"
  },
  {
    "position": [
      53,
      56
    ],
    "value": "and"
  },
  {
    "position": [
      57,
      58
    ],
    "value": "a"
  },
  {
    "position": [
      59,
      63
    ],
    "value": "sack"
  },
  {
    "position": [
      64,
      66
    ],
    "value": "of"
  },
  {
    "position": [
      67,
      77
    ],
    "value": "sandwiches"
  },
  {
    "position": [
      78,
      81
    ],
    "value": "and"
  },
  {
    "position": [
      82,
      83
    ],
    "value": "a"
  },
  {
    "position": [
      84,
      91
    ],
    "value": "thermos"
  },
  {
    "position": [
      92,
      94
    ],
    "value": "of"
  },
  {
    "position": [
      95,
      100
    ],
    "value": "water"
  },
  {
    "position": [
      103,
      108
    ],
    "value": "we're"
  },
  {
    "position": [
      109,
      114
    ],
    "value": "going"
  },
  {
    "position": [
      115,
      117
    ],
    "value": "on"
  },
  {
    "position": [
      118,
      119
    ],
    "value": "a"
  },
  {
    "position": [
      120,
      127
    ],
    "value": "journey"
  },
  {
    "position": [
      130,
      132
    ],
    "value": "my"
  },
  {
    "position": [
      133,
      139
    ],
    "value": "father"
  },
  {
    "position": [
      140,
      144
    ],
    "value": "said"
  },
  {
    "position": [
      147,
      149
    ],
    "value": "to"
  },
  {
    "position": [
      150,
      151
    ],
    "value": "a"
  },
  {
    "position": [
      152,
      158
    ],
    "value": "secret"
  },
  {
    "position": [
      159,
      164
    ],
    "value": "place"
  },
  {
    "position": [
      166,
      171
    ],
    "value": "we'll"
  },
  {
    "position": [
      172,
      177
    ],
    "value": "catch"
  },
  {
    "position": [
      178,
      181
    ],
    "value": "the"
  },
  {
    "position": [
      182,
      185
    ],
    "value": "air"
  },
  {
    "position": [
      187,
      192
    ],
    "value": "we'll"
  },
  {
    "position": [
      193,
      198
    ],
    "value": "catch"
  },
  {
    "position": [
      199,
      202
    ],
    "value": "the"
  },
  {
    "position": [
      203,
      209
    ],
    "value": "breeze"
  }
];

const content = 'Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. “We’re going on a journey,” my father said. “To a secret place. We’ll catch the air! We’ll catch the breeze!"';

let cursorPosition = 0; // index in `content` just past the last token consumed so far

/**
 * Flat list of segments covering `content` from its start to the end of the
 * last token. Each token contributes two entries, in order:
 *   { type: 'non-word', value } - the text between the previous token and this one
 *   { type: 'word', value }     - the token's own text
 * Values are sliced out of `content` (not taken from `token.value`) so the
 * sentence's original casing and characters are what gets rendered.
 */
const getWordsFromTokens = tokens.reduce((segments, { position: [tokenStart, tokenEnd] }) => {
  segments.push(
    { type: 'non-word', value: content.substring(cursorPosition, tokenStart) },
    { type: 'word', value: content.substring(tokenStart, tokenEnd) }
  );

  cursorPosition = tokenEnd; // the next gap starts where this token stops

  return segments;
}, []);

// Whatever follows the last token (closing punctuation, if any).
const endOfSentence = content.substring(cursorPosition);

// Build real DOM nodes instead of concatenating onto innerHTML: text is never
// parsed as markup, so characters such as <, & or ' in the sentence cannot
// break or inject HTML, and the target element is touched only once.
const sentenceFragment = document.createDocumentFragment();

for (const { type, value } of getWordsFromTokens) {
  if (type === 'word') {
    const link = document.createElement('a');
    // Encode the word so it stays a single, valid path segment.
    link.setAttribute('href', `/word/${encodeURIComponent(value)}`);
    link.textContent = value;
    sentenceFragment.append(link);
  } else {
    sentenceFragment.append(value); // strings are appended as text nodes
  }
}

sentenceFragment.append(endOfSentence);

document.getElementById('new-sentence').append(sentenceFragment);

<p id='new-sentence'></p>

回复收藏 0 原文

执妄 2025-02-01 03:22:59

我认为使用正则表达式（RegExp）会让事情简单很多：

const content = `Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. "We're going on a journey," my father said. "To a secret place. We'll catch the air! We'll catch the breeze!`;

// A segment is either a word (word characters and apostrophes) or a run of
// punctuation together with any whitespace/quotes that directly follow it.
// The hyphen is escaped on purpose: an unescaped `:-_` inside a character
// class is a RANGE from ":" to "_", which fails to match "-" itself and
// instead matches "<", "=", ">", "@", "[", "]", "^" and the uppercase letters.
const SEGMENT_PATTERN = /[\w']+|[.;,:\-_?!"]+[\s"]*/g;

/**
 * Splits text into word segments and punctuation segments, in order.
 * Whitespace that does not follow punctuation is not part of any segment.
 *
 * @param {string} text - the sentence to split
 * @returns {string[]} the matched segments; an empty array when nothing matches
 */
const tokenize = (text) => text.match(SEGMENT_PATTERN) || []; // match() yields null on no match

const result = tokenize(content);
console.log(result);
// Tells punctuation segments apart from words when rendering.
const punctuation = /[\.;,:\-_?!"]+/;

 function App() {
    return (
    <div>
      {result.map((w) =>
        punctuation.test(w) ? w : <a href={`/word/${w}`}>{w + '\n'}</a>
      )}
    </div>
  );
}

ReactDOM.render(<App/>, document.getElementById("root"))

<div id="root"></div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react/16.6.3/umd/react.production.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react-dom/16.6.3/umd/react-dom.production.min.js"></script>

I think that using RegExp would make your life easier:

const content = `Into the trunk we put two poles and the can of worms and a sack of sandwiches and a thermos of water. "We're going on a journey," my father said. "To a secret place. We'll catch the air! We'll catch the breeze!`;

// A segment is either a word (word characters and apostrophes) or a run of
// punctuation together with any whitespace/quotes that directly follow it.
// The hyphen is escaped on purpose: an unescaped `:-_` inside a character
// class is a RANGE from ":" to "_", which fails to match "-" itself and
// instead matches "<", "=", ">", "@", "[", "]", "^" and the uppercase letters.
const SEGMENT_PATTERN = /[\w']+|[.;,:\-_?!"]+[\s"]*/g;

/**
 * Splits text into word segments and punctuation segments, in order.
 * Whitespace that does not follow punctuation is not part of any segment.
 *
 * @param {string} text - the sentence to split
 * @returns {string[]} the matched segments; an empty array when nothing matches
 */
const tokenize = (text) => text.match(SEGMENT_PATTERN) || []; // match() yields null on no match

const result = tokenize(content);
console.log(result);
// Tells punctuation segments apart from words when rendering.
const punctuation = /[\.;,:\-_?!"]+/;

 function App() {
    return (
    <div>
      {result.map((w) =>
        punctuation.test(w) ? w : <a href={`/word/${w}`}>{w + '\n'}</a>
      )}
    </div>
  );
}

ReactDOM.render(<App/>, document.getElementById("root"))

<div id="root"></div>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react/16.6.3/umd/react.production.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/react-dom/16.6.3/umd/react-dom.production.min.js"></script>

回复收藏 0 原文

~没有更多了~