在文档中查找信用卡号的正则表达式不起作用

发布于 2024-08-20 12:33:33 字数 3099 浏览 2 评论 0原文

我正在创建一个小型应用程序,它将打开一个Word文档,扫描它以查找信用卡号码(不同的模式),替换文本,保存并关闭文档。

我的代码相当简单:

using System;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Word = Microsoft.Office.Interop.Word;

// Console program that starts Word through the Office interop API, opens one
// document, runs a wildcard find-and-replace over it and saves the document.
namespace ParseFilesAndRemoveRegExp
{
    /// <summary>
    /// Program entry point: creates a <c>FileManagement</c> and runs its
    /// single search-and-replace operation.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            FileManagement m = new FileManagement();
            m.OpenSearchAndReplace();
        }
    }

    /// <summary>
    /// Holds the Word application object and performs the open / find / replace /
    /// save sequence on a hard-coded document path.
    /// </summary>
    class FileManagement
    {
        // Word automation object; remains null if the constructor's creation call throws.
        Word.Application wordapp;

        /// <summary>
        /// Creates the Word application instance. Any exception is converted to a
        /// string that is never used, so a start-up failure is not reported.
        /// </summary>
        public FileManagement()
        {
            try
            {
                wordapp = new Word.Application();
            }
            catch(Exception ex)
            {
                // The exception text is stored in a local that goes out of scope immediately.
                if (ex != null)
                {
                    string s = ex.ToString();
                }
            }
        }

        /// <summary>
        /// Opens the document, sets the find pattern and replacement text on the
        /// application's Selection.Find object, executes a replace-all with the
        /// wildcard flag set, and saves the document. On an exception Word is
        /// quit; the success path in this method does not close the document or
        /// quit Word.
        /// </summary>
        internal void OpenSearchAndReplace()
        {
            // Placeholder passed for every optional COM argument that is not supplied.
            object nullobj = System.Reflection.Missing.Value;
            try
            { 
                // NOTE(review): this is a verbatim (@) literal, so each "\\" stays as two
                // backslash characters in the path.
                object filename = @"c:\\temp\\document.docx";
                object replaceAll = Word.WdReplace.wdReplaceAll;

                object matchWildCards = true;
                // readOnly and isVisible are declared but not passed to any call below.
                object readOnly = false;
                object isVisible = false;

                // Only the file name is supplied; all other arguments are Missing.Value.
                Word.Document doc = wordapp.Documents.Open( ref filename, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, 
                                                            ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                                            ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                // Presumably so that wordapp.Selection refers to this document — confirm.
                doc.Activate();
                wordapp.Selection.Find.ClearFormatting();

                //wordapp.Selection.Find.Text = "[0-9]{16}";
                // NOTE(review): in a regular (non-verbatim) C# literal, \b is the backspace
                // escape (U+0008), not a regex word boundary. The pattern is also written in
                // .NET regex syntax while the search below runs with the wildcard flag;
                // presumably Word's wildcard syntax differs — confirm against Word's docs.
                wordapp.Selection.Find.Text = "\b(?:[0-9][ -]*?){13,16}\b";
                wordapp.Selection.Find.Replacement.ClearFormatting();
                wordapp.Selection.Find.Replacement.Text = "---Cardnumber automatically removed---";

                // matchWildCards is the 4th argument and replaceAll the 11th; the rest are Missing.Value.
                wordapp.Selection.Find.Execute(ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                    ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                    ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                doc.Save();
            }
            catch(Exception ex)
            {
                // The exception text is captured in an unused local; nothing is logged or rethrown.
                string s = ex.ToString();
                if( wordapp != null )
                {
                    //wordapp.Documents.Close( ref nullobj, ref nullobj, ref nullobj );
                    wordapp.Quit( ref nullobj, ref nullobj, ref nullobj );
                }
            }
        }
    }
}

但是 - 运行时出现异常:“System.Runtime.InteropServices.COMException (0x800A15B8):查找内容文本包含无效的模式匹配表达式”。

我认为这可能与我发送到Word的字符有关,所以我之前将\d与[0-9]交换。但没有变化。如果我使用 [0-9]{16} 运行,它会将 1234567891012345 替换为我要使用的字符串。

有人可以帮我吗?我是否必须使用许多不同的正则表达式进行搜索才能管理文档,或者可以使用一个简单的正则表达式(例如我已有的正则表达式)来完成此操作吗?

I am creating a small application that will open a word document, scan it for a credit card number (different patterns), replace the text, save and close the document.

My code is fairly simple:

using System;
using System.IO;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Word = Microsoft.Office.Interop.Word;

// Console program that starts Word through the Office interop API, opens one
// document, runs a wildcard find-and-replace over it and saves the document.
namespace ParseFilesAndRemoveRegExp
{
    /// <summary>
    /// Program entry point: creates a <c>FileManagement</c> and runs its
    /// single search-and-replace operation.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            FileManagement m = new FileManagement();
            m.OpenSearchAndReplace();
        }
    }

    /// <summary>
    /// Holds the Word application object and performs the open / find / replace /
    /// save sequence on a hard-coded document path.
    /// </summary>
    class FileManagement
    {
        // Word automation object; remains null if the constructor's creation call throws.
        Word.Application wordapp;

        /// <summary>
        /// Creates the Word application instance. Any exception is converted to a
        /// string that is never used, so a start-up failure is not reported.
        /// </summary>
        public FileManagement()
        {
            try
            {
                wordapp = new Word.Application();
            }
            catch(Exception ex)
            {
                // The exception text is stored in a local that goes out of scope immediately.
                if (ex != null)
                {
                    string s = ex.ToString();
                }
            }
        }

        /// <summary>
        /// Opens the document, sets the find pattern and replacement text on the
        /// application's Selection.Find object, executes a replace-all with the
        /// wildcard flag set, and saves the document. On an exception Word is
        /// quit; the success path in this method does not close the document or
        /// quit Word.
        /// </summary>
        internal void OpenSearchAndReplace()
        {
            // Placeholder passed for every optional COM argument that is not supplied.
            object nullobj = System.Reflection.Missing.Value;
            try
            { 
                // NOTE(review): this is a verbatim (@) literal, so each "\\" stays as two
                // backslash characters in the path.
                object filename = @"c:\\temp\\document.docx";
                object replaceAll = Word.WdReplace.wdReplaceAll;

                object matchWildCards = true;
                // readOnly and isVisible are declared but not passed to any call below.
                object readOnly = false;
                object isVisible = false;

                // Only the file name is supplied; all other arguments are Missing.Value.
                Word.Document doc = wordapp.Documents.Open( ref filename, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, 
                                                            ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                                            ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                // Presumably so that wordapp.Selection refers to this document — confirm.
                doc.Activate();
                wordapp.Selection.Find.ClearFormatting();

                //wordapp.Selection.Find.Text = "[0-9]{16}";
                // NOTE(review): in a regular (non-verbatim) C# literal, \b is the backspace
                // escape (U+0008), not a regex word boundary. The pattern is also written in
                // .NET regex syntax while the search below runs with the wildcard flag;
                // presumably Word's wildcard syntax differs — confirm against Word's docs.
                wordapp.Selection.Find.Text = "\b(?:[0-9][ -]*?){13,16}\b";
                wordapp.Selection.Find.Replacement.ClearFormatting();
                wordapp.Selection.Find.Replacement.Text = "---Cardnumber automatically removed---";

                // matchWildCards is the 4th argument and replaceAll the 11th; the rest are Missing.Value.
                wordapp.Selection.Find.Execute(ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                    ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                    ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
                doc.Save();
            }
            catch(Exception ex)
            {
                // The exception text is captured in an unused local; nothing is logged or rethrown.
                string s = ex.ToString();
                if( wordapp != null )
                {
                    //wordapp.Documents.Close( ref nullobj, ref nullobj, ref nullobj );
                    wordapp.Quit( ref nullobj, ref nullobj, ref nullobj );
                }
            }
        }
    }
}

However - I get an exception when I run it: "System.Runtime.InteropServices.COMException (0x800A15B8): The Find What text contains a Pattern Match expression which is not valid".

I thought this might have something to do with the characters I sent to Word, so I previously exchanged \d with [0-9]. But no change. If I run with [0-9]{16}, it replaces the 1234567891012345 with the string I want to use.

Can anyone help me out here? Do I have to search with a number of different regex to manage a document, or can this be done with one simple regex like the one I already have?

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(5)

红玫瑰 2024-08-27 12:33:33

尝试使用 \\b 而不是 \b。否则,字符串解析器会把 \b 解析为退格符(ASCII 码 8,backspace)放入字符串中,您将不会获得匹配。

Try \\b instead of \b. Otherwise, the string parser will put the backspace character (ASCII code 8) in the string and you won't get a match.

静若繁花 2024-08-27 12:33:33

您是否尝试过转义?:

wordapp.Selection.Find.Text = @"\b(?:[0-9][ -]*?){13,16}\b"; 

如果这不起作用,您需要从一个简单的正则表达式(或者实际上只是一个纯文本单词)开始,验证它是否有效,然后分阶段构建正则表达式。

Have you tried escaping?:

wordapp.Selection.Find.Text = @"\b(?:[0-9][ -]*?){13,16}\b"; 

If that doesn't work, you need to start with a simple Regular Expression (or in fact just a plain text word), verify it works and then build up the RegEx in stages.

百合的盛世恋 2024-08-27 12:33:33

以非常简单的方式做到这一点给了我一些有效的东西:

// Runs three replace-all passes, one per literal card-number layout:
// 16 digits, four groups of 4 separated by '-', and four groups of 4 separated by spaces.
// Fragment: wordapp, nullobj, matchWildCards and replaceAll are not declared here and
// must come from the surrounding code.
for (int i = 0; i < 3; ++i)
            { 
                // Select the pattern for this pass.
                if( i == 0 )
                    wordapp.Selection.Find.Text = "[0-9]{16}";
                else if( i == 1 )
                    wordapp.Selection.Find.Text = "[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}";
                else if( i == 2 )
                    wordapp.Selection.Find.Text = "[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

                // matchWildCards is the 4th argument and replaceAll the 11th; the rest are nullobj.
                wordapp.Selection.Find.Execute( ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                                ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                                ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
            }

这不是一个非常好的设置,但是嘿 - 它有效。删除了 XXXXXXXXXXXXXXXX、XXXX XXXX XXXX XXXX 和 XXXX-XXXX-XXXX-XXXX 等号码。如果需要的话我会添加其他的。

Doing it the very simple way gave me something that worked:

// Runs three replace-all passes, one per literal card-number layout:
// 16 digits, four groups of 4 separated by '-', and four groups of 4 separated by spaces.
// Fragment: wordapp, nullobj, matchWildCards and replaceAll are not declared here and
// must come from the surrounding code.
for (int i = 0; i < 3; ++i)
            { 
                // Select the pattern for this pass.
                if( i == 0 )
                    wordapp.Selection.Find.Text = "[0-9]{16}";
                else if( i == 1 )
                    wordapp.Selection.Find.Text = "[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}";
                else if( i == 2 )
                    wordapp.Selection.Find.Text = "[0-9]{4} [0-9]{4} [0-9]{4} [0-9]{4}";

                // matchWildCards is the 4th argument and replaceAll the 11th; the rest are nullobj.
                wordapp.Selection.Find.Execute( ref nullobj, ref nullobj, ref nullobj, ref matchWildCards,
                                                ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
                                                ref replaceAll, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
            }

It is not a very nice setup, but hey - it works. Removed numbers like XXXXXXXXXXXXXXXX, XXXX XXXX XXXX XXXX and XXXX-XXXX-XXXX-XXXX. I will add others if necessary.

挽手叙旧 2024-08-27 12:33:33

我的猜测是 Word 有自己的正则表达式风格。您是否尝试过在 Word 中打开文档并在“查找和替换”对话框中使用该正则表达式?

实际上,根据 http://www.regexinference.com/documentation/Microsoft-Word-Wildcards-as-Regular-Expressions.html,Word 不支持非捕获括号,因此您必须想出不同的解决方案。

My guess would be that Word has its own flavour of regex. Have you tried opening a document in Word and using that regex in the Find and Replace dialog?

Actually, according to http://www.regexinference.com/documentation/Microsoft-Word-Wildcards-as-Regular-Expressions.html, Word doesn't support non-capturing parentheses, so you're going to have to come up with a different solution.

一念一轮回 2024-08-27 12:33:33

迄今为止,我们有以下最佳解决方案,超越了单行。
这不是ms word,但你肯定能得到你想要的。

// .NET regex with five alternatives: "-*" + 2 word chars + 15-16 digits; "CC" + 2 word
// chars + 15-16 digits; a bare run of 15-16 digits; 4-4-4-4 digits with hyphens; and
// 4-6-5 digits with hyphens.
private const string _creditCardPatternMatchingExpression = @"(?m:-[*]\w{2}\d{15,16})|(?m:CC\w{2}\d{15,16})|(?m:\d{15,16})|(\d{4}-\d{4}-\d{4}-\d{4})|(\d{4}-\d{6}-\d{5})";

        /// <summary>
        /// Replaces every match of the credit-card pattern in the input with
        /// "CCXXXXXXXXXXXXXX". It then builds a lower-cased copy with line breaks,
        /// spaces, hyphens and &lt;br&gt; variants removed; if the pattern matches
        /// that copy, the replaced copy is returned instead of the first-pass result.
        /// </summary>
        /// <param name="contentThatMayHaveCreditCardData">Text to scan; as an extension
        /// method this must be declared in a static class, which is not shown here.</param>
        /// <returns>The first-pass replaced text, or the lower-cased, stripped and
        /// replaced text when the stripped copy still matches the pattern.</returns>
        public static string CleanCreditCardData(this String contentThatMayHaveCreditCardData)
    {
        // First pass: replace matches in the text as given.
        string initiallyCleanedUpData = Regex.Replace(contentThatMayHaveCreditCardData, _creditCardPatternMatchingExpression, "CCXXXXXXXXXXXXXX");
        // NOTE(review): the final Replace argument reads the same as the earlier
        // single-space Replace as shown here; it may originally have been a different
        // whitespace character — confirm against the original post.
        string completeSpaceEnterCleanedUpVersion = initiallyCleanedUpData.ToLower().Replace("\r\n", "").Replace("\n", "").Replace(" ", "").Replace("-", "").Replace("<br>", "").Replace("<br />", "").Replace("<br/>", "").Replace(" ", "");
        // Second pass: catches numbers that only match once separators are removed.
        if (Regex.IsMatch(completeSpaceEnterCleanedUpVersion,_creditCardPatternMatchingExpression))
            return Regex.Replace(completeSpaceEnterCleanedUpVersion, _creditCardPatternMatchingExpression, "CCXXXXXXXXXXXXXX");

        return initiallyCleanedUpData;
    }

We have the following as the best solution so far which goes beyond single line.
It is not ms word but you can get what you want for sure.

// .NET regex with five alternatives: "-*" + 2 word chars + 15-16 digits; "CC" + 2 word
// chars + 15-16 digits; a bare run of 15-16 digits; 4-4-4-4 digits with hyphens; and
// 4-6-5 digits with hyphens.
private const string _creditCardPatternMatchingExpression = @"(?m:-[*]\w{2}\d{15,16})|(?m:CC\w{2}\d{15,16})|(?m:\d{15,16})|(\d{4}-\d{4}-\d{4}-\d{4})|(\d{4}-\d{6}-\d{5})";

        /// <summary>
        /// Replaces every match of the credit-card pattern in the input with
        /// "CCXXXXXXXXXXXXXX". It then builds a lower-cased copy with line breaks,
        /// spaces, hyphens and &lt;br&gt; variants removed; if the pattern matches
        /// that copy, the replaced copy is returned instead of the first-pass result.
        /// </summary>
        /// <param name="contentThatMayHaveCreditCardData">Text to scan; as an extension
        /// method this must be declared in a static class, which is not shown here.</param>
        /// <returns>The first-pass replaced text, or the lower-cased, stripped and
        /// replaced text when the stripped copy still matches the pattern.</returns>
        public static string CleanCreditCardData(this String contentThatMayHaveCreditCardData)
    {
        // First pass: replace matches in the text as given.
        string initiallyCleanedUpData = Regex.Replace(contentThatMayHaveCreditCardData, _creditCardPatternMatchingExpression, "CCXXXXXXXXXXXXXX");
        // NOTE(review): the final Replace argument reads the same as the earlier
        // single-space Replace as shown here; it may originally have been a different
        // whitespace character — confirm against the original post.
        string completeSpaceEnterCleanedUpVersion = initiallyCleanedUpData.ToLower().Replace("\r\n", "").Replace("\n", "").Replace(" ", "").Replace("-", "").Replace("<br>", "").Replace("<br />", "").Replace("<br/>", "").Replace(" ", "");
        // Second pass: catches numbers that only match once separators are removed.
        if (Regex.IsMatch(completeSpaceEnterCleanedUpVersion,_creditCardPatternMatchingExpression))
            return Regex.Replace(completeSpaceEnterCleanedUpVersion, _creditCardPatternMatchingExpression, "CCXXXXXXXXXXXXXX");

        return initiallyCleanedUpData;
    }
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文