如何使用 awk 对字符串中指定位置的字符进行部分替换

发布于 2025-01-28 05:56:50 字数 2221 浏览 1 评论 0原文

我有一个查找（lookup）文件，用来在 file_2 中搜索已有的记录；如果存在这样的记录，就用 # 替换这些记录。目前我的代码会把整条记录都替换成 #，但我需要的是部分替换：我想把字符串中的每两个字符替换为 #。我该怎么做？非常感谢您的帮助。谢谢

代码

awk ' NR==FNR {
# Pass 1 (first file, lookup): remember each original line and its masked form.
s = $0;
gsub("[A-Za-z0-9]","#");   # replaces EVERY letter and digit of the line with "#"
a[s] = $0;                 # a[original line] = fully masked line
next
}

# Pass 2 (second file, file_1): mask the text that follows a ">" when that
# text is exactly one of the lookup lines, then buffer the line in lines[].
{
if match($0, ">[^<]+"))    # NOTE(review): "(" is missing after "if"; the parentheses on this line are unbalanced
{
str = substr($0, RSTART+1, RLENGTH-1)   # matched text without the leading ">"
if (str in a )
{
# keep everything up to and including ">", insert the masked text, keep the rest
$0 = substr($0, 1, RSTART) a[str] substr($0, RSTART+RLENGTH)
}
}
lines[FNR]=$0
}

# END: second sweep over the buffered lines, substituting every lookup key
# that appears between the word-boundary operators \< and \>.
# NOTE(review): str is used as a regex without escaping and a[str] is used as
# the gsub replacement text, where "&" stands for the matched text; keys such
# as MS&CO(NY) contain both kinds of special character - confirm intent.
# NOTE(review): this block contains no print statement, so nothing is written
# to file_2, and the brace opened by "END {" is never closed.
END {for (i=1;i<=FNR;i++)
{
for (str in a )
{
regex = "\\<" str "\\>"
gsub(regex,a[str],lines[I])   # NOTE(review): subscript is uppercase I, but the loop variable is lowercase i
}
}' lookup file_1 > file_2

cat lookup

CDX98XSD
@vanti Finserv Co.
11:11 - Capital
MS&CO(NY)
MS&CO(NY)
MS&CO(NY)

cat file_1

<html>
<body>
<hr><br><>span class="table">Records</span><table>
<tr class="data">
<td>@vanti Finserv Co.</td>
<td>11:11 - Capital</td>
<td>MS&CO(NY)</td>
<td>New York</td>
<td>CDX98XSD</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="data">
<td>@vanti Finserv Co.</td>
<td></td>
<td>MS&CO(NY)</td>
<td>2</td>
<td>2</td>
<td>MS&CO(NY)</td>
<td>MS&CO(NY)</td>
<td></td>
</table>
</body>
</html>

预期输出

<html>
<body>
<hr><br><>span class="table">Records</span><table>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td>1##11 - C##I##l</td>
<td>M##C##N##</td>
<td>New York</td>
<td>C##9##S#</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td></td>
<td>M##C##N##</td>
<td>2</td>
<td>2</td>
<td>M##C##N##</td>
<td>M##C##N##</td>
<td></td>
</table>
</body>
</html>

原文

I have a lookup file that I use to search the available records in file_2 and if such records are present then replace those records with #. Currently my code is substituting the entire record with # but I need to partially substitute it.
I want to replace every two characters of the string with #. How can I do so? Your help will be much appreciated. Thanks

code

awk ' NR==FNR {
# Pass 1 (first file, lookup): remember each original line and its masked form.
s = $0;
gsub("[A-Za-z0-9]","#");   # replaces EVERY letter and digit of the line with "#"
a[s] = $0;                 # a[original line] = fully masked line
next
}

# Pass 2 (second file, file_1): mask the text that follows a ">" when that
# text is exactly one of the lookup lines, then buffer the line in lines[].
{
if match($0, ">[^<]+"))    # NOTE(review): "(" is missing after "if"; the parentheses on this line are unbalanced
{
str = substr($0, RSTART+1, RLENGTH-1)   # matched text without the leading ">"
if (str in a )
{
# keep everything up to and including ">", insert the masked text, keep the rest
$0 = substr($0, 1, RSTART) a[str] substr($0, RSTART+RLENGTH)
}
}
lines[FNR]=$0
}

# END: second sweep over the buffered lines, substituting every lookup key
# that appears between the word-boundary operators \< and \>.
# NOTE(review): str is used as a regex without escaping and a[str] is used as
# the gsub replacement text, where "&" stands for the matched text; keys such
# as MS&CO(NY) contain both kinds of special character - confirm intent.
# NOTE(review): this block contains no print statement, so nothing is written
# to file_2, and the brace opened by "END {" is never closed.
END {for (i=1;i<=FNR;i++)
{
for (str in a )
{
regex = "\\<" str "\\>"
gsub(regex,a[str],lines[I])   # NOTE(review): subscript is uppercase I, but the loop variable is lowercase i
}
}' lookup file_1 > file_2

cat lookup

CDX98XSD
@vanti Finserv Co.
11:11 - Capital
MS&CO(NY)
MS&CO(NY)
MS&CO(NY)

cat file_1

<html>
<body>
<hr><br><>span class="table">Records</span><table>
<tr class="data">
<td>@vanti Finserv Co.</td>
<td>11:11 - Capital</td>
<td>MS&CO(NY)</td>
<td>New York</td>
<td>CDX98XSD</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="data">
<td>@vanti Finserv Co.</td>
<td></td>
<td>MS&CO(NY)</td>
<td>2</td>
<td>2</td>
<td>MS&CO(NY)</td>
<td>MS&CO(NY)</td>
<td></td>
</table>
</body>
</html>

expected output

<html>
<body>
<hr><br><>span class="table">Records</span><table>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td>1##11 - C##I##l</td>
<td>M##C##N##</td>
<td>New York</td>
<td>C##9##S#</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td></td>
<td>M##C##N##</td>
<td>2</td>
<td>2</td>
<td>M##C##N##</td>
<td>M##C##N##</td>
<td></td>
</table>
</body>
</html>

分享到QQ

分享到微博

如果你对这篇内容有疑问，欢迎到本站社区发帖提问参与讨论，获取更多帮助，或者扫码二维码加入 Web 技术交流群。

发布评论

需要登录才能够评论，你可以免费注册一个本站的账号。

紫竹語嫣☆ 2025-02-04 05:56:50

假设/理解：

lookup 中的重复条目可以忽略（即，我们不会对重复出现的条目做不同的处理）
对于 lookup 中每个以空白字符分隔的字符串，我们要把第 n 个和第 (n+1) 个字符替换为 #（其中 n = 2,5,8,11,14,17,20,....）
对于查找字符串 11:11 - Capital，正确的替换字符串是 1##1# - C##i##l（而不是 OP 给出的 1##11 - C##i##l）

在输入文件中添加了以下行（基于OP的注释）：

# file: lookup
READ 1234
READ
READ NOW

# file: file_1
<td>READ 1234 stuff READ</td>
<td>READ READ NOW</td>
<td>READ NOW READ 9999 York</td>

一个awk构想：

awk '
# Partially mask every lookup string found in the data file.
#
# Usage: awk <this program> lookup file_1 > file_2
#   lookup : one search string per line
#   file_1 : text to scan; every line is printed, masked where a lookup
#            string occurs
#
# Masking rule, applied to each whitespace-delimited word of a lookup line:
# characters 1,4,7,... are kept and the (up to) two characters following each
# kept character become "#". Whitespace between words is left untouched, so a
# masked string always has the same length as its lookup string.

# Pass 1 (lookup file): build lookups[original line] = masked line
FNR==NR { if ($0 in lookups)                           # if duplicate then ...
             next                                      # ignore
          lookups[$0]=$0
          for (i=1;i<=NF;i++) {                        # loop through list of white space delimited fields
              oldstr=$i
              newstr=""
              # Compare with "" explicitly: a bare while (oldstr) evaluates a
              # numeric-looking field such as 0 or 000 as the number zero
              # (false), which would drop that word from the masked string.
              while (oldstr != "") {
                    len=length(oldstr)
                    # keep 1st char; replace 2nd/3rd chars if length > 1/2, respectively
                    newstr=newstr substr(oldstr,1,1) substr("##",1,len-1)
                    oldstr=substr(oldstr,4)            # strip off first 3 characters
              }
              ndx=index(lookups[$0],$i)                # locate position of $i in current line
              # replace $i with newstr
              lookups[$0]=substr(lookups[$0],1,ndx-1) newstr substr(lookups[$0],ndx+length($i))
          }
          next
        }

# Pass 2 (data file): keep replacing the earliest (and, on a tie, longest)
# lookup string in the line until none is left, then print the line.
        { match_found=1
          while (match_found) {

                match_found=0
                ndx=99999999
                len=0
                n=99999999

                # find earliest and longest match

                for (i in lookups) {

                    # An entry whose masked form equals the entry itself
                    # (one-character words, blank lines, text that is already
                    # all "#") would be "replaced" by itself and found again
                    # on the next pass, so the loop would never end. Skipping
                    # it changes no output. Every remaining replacement turns
                    # at least one character into "#", so the loop terminates.
                    if (lookups[i] == i)
                       continue

                    curr_len=length(i)                 # length of the text being cut out (same as its masked form)
                    curr_ndx=index($0,i)

                    if (curr_ndx > 0) {
                       match_found=1

                       if (curr_ndx < ndx || (curr_ndx == ndx && curr_len > len)) {
                          ndx=curr_ndx
                          len=curr_len
                          n=i
                       }
                    } # if (curr_ndx > 0)
                } # for (i in lookup)

                if (match_found)
                   $0=substr($0,1,ndx-1) lookups[n] substr($0,ndx+len)

          } # while ( match_found )
          print
        }

 # uncomment following block to display contents of lookups[]
 #END    { print "############ lookups[]"
 #         for (i in lookups)
 #             print i " => " lookups[i]
 #         print "############"
 #       }
' lookup file_1 > file_2

此生成：

$ cat file_2
<html>
<body>
<hr><br><>span class="table">Records</span><table>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td>1##1# - C##i##l</td>
<td>M##C##N##</td>
<td>New York</td>
<td>C##9##S#</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td></td>
<td>M##C##N##</td>
<td>2</td>
<td>2</td>
<td>M##C##N##</td>
<td>M##C##N##</td>
<td></td>
</table>
</body>
</html>
<td>R##D 1##4 stuff R##D</td>
<td>R##D R##D N##</td>
<td>R##D N## R##D 9999 York</td>

专注于差异：

$ diff file_1 file_2
5,7c5,7
< <td>@vanti Finserv Co.</td>
< <td>11:11 - Capital</td>
< <td>MS&CO(NY)</td>
---
> <td>@##n## F##s##v C##</td>
> <td>1##1# - C##i##l</td>
> <td>M##C##N##</td>
9c9
< <td>CDX98XSD</td>
---
> <td>C##9##S#</td>
16c16
< <td>@vanti Finserv Co.</td>
---
> <td>@##n## F##s##v C##</td>
18c18
< <td>MS&CO(NY)</td>
---
> <td>M##C##N##</td>
21,22c21,22
< <td>MS&CO(NY)</td>
< <td>MS&CO(NY)</td>
---
> <td>M##C##N##</td>
> <td>M##C##N##</td>
27,29c27,29
< <td>READ 1234 stuff READ</td>
< <td>READ READ NOW</td>
< <td>READ NOW READ 9999 York</td>
---
> <td>R##D 1##4 stuff R##D</td>
> <td>R##D R##D N##</td>
> <td>R##D N## R##D 9999 York</td>

取消注释 END{...} 代码块后会生成：

############ lookups[]
READ NOW => R##D N##
READ => R##D
MS&CO(NY) => M##C##N##
READ 1234 => R##D 1##4
CDX98XSD => C##9##S#
@vanti Finserv Co. => @##n## F##s##v C##
11:11 - Capital => 1##1# - C##i##l
############

Assumptions/Understandings:

duplicate entries in lookup can be ignored (ie, we don't treat duplicate occurrences differently)
for each white space delimited string in lookup we want to replace the nth/(n+1)th characters with # (where n = 2,5,8,11,14,17,20,....)
for the lookup string 11:11 - Capital the correct replacement string is 1##1# - C##i##l (as opposed to OP's 1##11 - C##i##l)

Added following lines to input files (based on comment from OP):

# file: lookup
READ 1234
READ
READ NOW

# file: file_1
<td>READ 1234 stuff READ</td>
<td>READ READ NOW</td>
<td>READ NOW READ 9999 York</td>

One awk idea:

awk '
# Partially mask every lookup string found in the data file.
#
# Usage: awk <this program> lookup file_1 > file_2
#   lookup : one search string per line
#   file_1 : text to scan; every line is printed, masked where a lookup
#            string occurs
#
# Masking rule, applied to each whitespace-delimited word of a lookup line:
# characters 1,4,7,... are kept and the (up to) two characters following each
# kept character become "#". Whitespace between words is left untouched, so a
# masked string always has the same length as its lookup string.

# Pass 1 (lookup file): build lookups[original line] = masked line
FNR==NR { if ($0 in lookups)                           # if duplicate then ...
             next                                      # ignore
          lookups[$0]=$0
          for (i=1;i<=NF;i++) {                        # loop through list of white space delimited fields
              oldstr=$i
              newstr=""
              # Compare with "" explicitly: a bare while (oldstr) evaluates a
              # numeric-looking field such as 0 or 000 as the number zero
              # (false), which would drop that word from the masked string.
              while (oldstr != "") {
                    len=length(oldstr)
                    # keep 1st char; replace 2nd/3rd chars if length > 1/2, respectively
                    newstr=newstr substr(oldstr,1,1) substr("##",1,len-1)
                    oldstr=substr(oldstr,4)            # strip off first 3 characters
              }
              ndx=index(lookups[$0],$i)                # locate position of $i in current line
              # replace $i with newstr
              lookups[$0]=substr(lookups[$0],1,ndx-1) newstr substr(lookups[$0],ndx+length($i))
          }
          next
        }

# Pass 2 (data file): keep replacing the earliest (and, on a tie, longest)
# lookup string in the line until none is left, then print the line.
        { match_found=1
          while (match_found) {

                match_found=0
                ndx=99999999
                len=0
                n=99999999

                # find earliest and longest match

                for (i in lookups) {

                    # An entry whose masked form equals the entry itself
                    # (one-character words, blank lines, text that is already
                    # all "#") would be "replaced" by itself and found again
                    # on the next pass, so the loop would never end. Skipping
                    # it changes no output. Every remaining replacement turns
                    # at least one character into "#", so the loop terminates.
                    if (lookups[i] == i)
                       continue

                    curr_len=length(i)                 # length of the text being cut out (same as its masked form)
                    curr_ndx=index($0,i)

                    if (curr_ndx > 0) {
                       match_found=1

                       if (curr_ndx < ndx || (curr_ndx == ndx && curr_len > len)) {
                          ndx=curr_ndx
                          len=curr_len
                          n=i
                       }
                    } # if (curr_ndx > 0)
                } # for (i in lookup)

                if (match_found)
                   $0=substr($0,1,ndx-1) lookups[n] substr($0,ndx+len)

          } # while ( match_found )
          print
        }

 # uncomment following block to display contents of lookups[]
 #END    { print "############ lookups[]"
 #         for (i in lookups)
 #             print i " => " lookups[i]
 #         print "############"
 #       }
' lookup file_1 > file_2

This generates:

$ cat file_2
<html>
<body>
<hr><br><>span class="table">Records</span><table>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td>1##1# - C##i##l</td>
<td>M##C##N##</td>
<td>New York</td>
<td>C##9##S#</td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr class="data">
<td>@##n## F##s##v C##</td>
<td></td>
<td>M##C##N##</td>
<td>2</td>
<td>2</td>
<td>M##C##N##</td>
<td>M##C##N##</td>
<td></td>
</table>
</body>
</html>
<td>R##D 1##4 stuff R##D</td>
<td>R##D R##D N##</td>
<td>R##D N## R##D 9999 York</td>

Focusing on just the differences:

$ diff file_1 file_2
5,7c5,7
< <td>@vanti Finserv Co.</td>
< <td>11:11 - Capital</td>
< <td>MS&CO(NY)</td>
---
> <td>@##n## F##s##v C##</td>
> <td>1##1# - C##i##l</td>
> <td>M##C##N##</td>
9c9
< <td>CDX98XSD</td>
---
> <td>C##9##S#</td>
16c16
< <td>@vanti Finserv Co.</td>
---
> <td>@##n## F##s##v C##</td>
18c18
< <td>MS&CO(NY)</td>
---
> <td>M##C##N##</td>
21,22c21,22
< <td>MS&CO(NY)</td>
< <td>MS&CO(NY)</td>
---
> <td>M##C##N##</td>
> <td>M##C##N##</td>
27,29c27,29
< <td>READ 1234 stuff READ</td>
< <td>READ READ NOW</td>
< <td>READ NOW READ 9999 York</td>
---
> <td>R##D 1##4 stuff R##D</td>
> <td>R##D R##D N##</td>
> <td>R##D N## R##D 9999 York</td>

Uncommenting the END{...} block generates:

############ lookups[]
READ NOW => R##D N##
READ => R##D
MS&CO(NY) => M##C##N##
READ 1234 => R##D 1##4
CDX98XSD => C##9##S#
@vanti Finserv Co. => @##n## F##s##v C##
11:11 - Capital => 1##1# - C##i##l
############

回复收藏 0 原文

~没有更多了~