如何建立一个单词表

发布于 2024-09-24 02:09:16 字数 7974 浏览 5 评论 0原文

所以现在我想制作爱沙尼亚语单词列表~大约 2000 万个小写的独特单词。要获取单词列表的输入,可以使用爱沙尼亚语料库。语料库文件采用文本编码倡议 (TEI) 格式。我尝试使用正则表达式来查找单词。

这就是我所做的:它效率低下,mcv 完全混乱,如果单词的哈希集无法放入内存,它就会刹车,它不知道输入编码 - 所以像 š 这样的字母可能会出现问题,它不会显示估计的完成时间,有些控件有默认名称,有些没有,它不使用多任务处理(不确定是否应该),它使用一些奇怪的修复和大量锁定界面,以便它看起来不会“冻结”。至少它很短,你几乎没有注意到没有评论。

优点是,它几乎可以从 .tei、.txt、.csv、smgl、xhtml 或任何类似格式的输入中读取单词,不会出现很多错误。

现在你知道我想做什么,我如何尝试做它(有什么问题),并且我只是想找出如何做它(用最少的体力劳动)。

图片示例:

alt text

代码示例 & Gui

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Data.SqlClient;
using System.IO;
using System.Text.RegularExpressions;

namespace Reader
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }


        private void listView1_DragEnter(object sender, DragEventArgs e)
        {
            if (e.Data.GetDataPresent(DataFormats.FileDrop, false) == true)
            {
                e.Effect = DragDropEffects.All;
            }  
        }

        private void listView1_DragDrop(object sender, DragEventArgs e)
        {
            setguiLock(true);
            this.loading.Visible = true;
            ignorechecking = true;
            string[] files = (string[])e.Data.GetData(DataFormats.FileDrop, false);
            Dictionary<String, ListViewGroup> listviewgroups = new Dictionary<string,ListViewGroup>();

            int filenamesi = 0;

            foreach (string file in files)
            {
                progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, files.Length);
                Application.DoEvents();
               if (File.Exists(file))
                {
                    FileInfo ff = new System.IO.FileInfo(file);
                    if (!listviewgroups.ContainsKey(ff.DirectoryName))
                    {
                        listviewgroups.Add(ff.DirectoryName, new ListViewGroup(ff.DirectoryName, HorizontalAlignment.Left));
                        listView1.Groups.Add(listviewgroups[ff.DirectoryName]);
                    }
                    ListViewItem item = new ListViewItem(ff.Name);
                    listviewgroups[ff.DirectoryName].Items.Add(item);
                    item.Checked = true;

                    item.SubItems.Add("" +((int)ff.Length/1024)+" KB");

                  //  item.Group.Header = ff.DirectoryName;
                  //  listviewgroups[ff.DirectoryName].Items.Add(item);
                    listView1.Items.Add(item);
               }
            }
            setguiLock(false);
            ignorechecking = false;
            this.loading.Visible = false;
            updatechecked();
        }

        private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e)
        {
            updatechecked();
        }
        private bool ignorechecking = false;
        private void updatechecked(){
            if (ignorechecking)
                return;
            long size = 0;
            int count = 0;
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (item.Checked)
                {
                    count++;
                    size += Int32.Parse(item.SubItems[1].Text.Split(" ".ToArray())[0]); 
                }
            }
            this.text1.Text = ""+count;
            this.text2.Text = ""+size + " KB";
        }
        private void putHashset(HashSet<string> d, string filename)
        {
            StringBuilder sb = new StringBuilder();
            foreach (string key in d) 
                sb.Append(key).Append("\n");

            File.WriteAllText(filename, sb.ToString());
        }
        private HashSet<string> getHashset(string filename)
        {
            return new HashSet<string>(new Regex("\\n+").Split(File.ReadAllText(filename)));
        }

        private void removefilefromlistview(string fullfilename) {
            foreach (ListViewItem item in this.listView1.Items)
            {
                String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                if (fullfilename.CompareTo(file) == 0)
                {
                    item.Checked = false;
                    this.listView1.Items.Remove(item);
                }
            }
        }
        private void starter(object sender, EventArgs e)
        {
            HashSet<string> filenames = new HashSet<string>();
            StringBuilder data = null;

            setguiLock(true);
            this.time2.Text = "";
            this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);

            foreach (ListViewItem item in this.listView1.Items) {
                if (item.Checked) {
                    String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                    if (File.Exists(file))
                        filenames.Add(file);
                }
            }

            string outputfile = output.Text;
            HashSet<string> words = null;
            if (File.Exists(output.Text))
                words = getHashset(outputfile);
            else
                words = new HashSet<string>();

            int filenamesnr = filenames.Count;
            int filenamesi = 0;
            foreach (String str in filenames){
                progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, filenamesnr);
                Application.DoEvents();
                data = new StringBuilder(System.IO.File.ReadAllText(str, Encoding.UTF7).ToLower());

                data = data.Replace("&auml;", "ä");
                data = data.Replace("&ouml;", "ö");
                data = data.Replace("&uuml;", "ü");
                data = data.Replace("&otilde;", "õ");

                String sdata = new Regex(@"<(.|\n)*?>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;").Replace(data.ToString(), "");

                foreach (string word in new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+").Split(sdata))
                    if(word.Length>1)
                            words.Add(word);

                removefilefromlistview(str);
            }
            progresslabel.Text = "Progress:";
            putHashset(words, outputfile);

            foreach (ListViewItem item in this.listView1.Items)
                if (item.Checked)
                {
                    item.Checked = false;
                    listView1.Items.Remove(item);
                }

            this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);
            setguiLock(false);
        }

        private void setguiLock(bool value){
            if(value){
                this.Enabled = false;
                this.button1.Enabled = false;
                this.listView1.Enabled = false;
                this.output.Enabled = false;
                this.openoutput.Enabled = false;
                this.progresslabel.Visible = true;
                this.Enabled = true;
            }else{
                this.Enabled = false;
                this.openoutput.Enabled = true;
                this.output.Enabled = true;
                this.listView1.Enabled = true;
                this.button1.Enabled = true;
                this.progresslabel.Visible = false;
                this.Enabled = true;
            }
        }

        private void button2_Click(object sender, EventArgs e)
        {
            if (!File.Exists(output.Text))
                   File.WriteAllText(output.Text, " ");
            System.Diagnostics.Process.Start(output.Text);
        }
    }
}

So now I want to make Estonian wordlist ~about 20m unique words in lowercase. To get input for wordlist, corpus of Estonian can be used. Corpus files are in Text Encoding Initiative (TEI) format. I tried using regex to find the words.

This is what I made: it's inefficient, mcv is all messed up, it brakes if hashset of words can't fit in memory, it's not aware of inputs encoding - so probably letters like š make problems, it does not show estimated completion time, some controls have default names and some don't, it does not use multitasking (not sure if it should), it uses some weird fixes and lots of locking interface so that it would appear not 'frozen'. At least its so short, that you hardly notice there are no comments.

Upside is, that it can almost read words without many mistakes, from .tei, .txt, .csv, smgl, xhtml or any a like format inputs.

Now you know what I want to do, how I have tried doing it (with what problems), and again I'm just trying to find out how to do it (with minimal manual labor).

Image example:

alt text

Code example & Gui:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Data.SqlClient;
using System.IO;
using System.Text.RegularExpressions;

namespace Reader
{
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }


        private void listView1_DragEnter(object sender, DragEventArgs e)
        {
            if (e.Data.GetDataPresent(DataFormats.FileDrop, false) == true)
            {
                e.Effect = DragDropEffects.All;
            }  
        }

        private void listView1_DragDrop(object sender, DragEventArgs e)
        {
            setguiLock(true);
            this.loading.Visible = true;
            ignorechecking = true;
            string[] files = (string[])e.Data.GetData(DataFormats.FileDrop, false);
            Dictionary<String, ListViewGroup> listviewgroups = new Dictionary<string,ListViewGroup>();

            int filenamesi = 0;

            foreach (string file in files)
            {
                progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, files.Length);
                Application.DoEvents();
               if (File.Exists(file))
                {
                    FileInfo ff = new System.IO.FileInfo(file);
                    if (!listviewgroups.ContainsKey(ff.DirectoryName))
                    {
                        listviewgroups.Add(ff.DirectoryName, new ListViewGroup(ff.DirectoryName, HorizontalAlignment.Left));
                        listView1.Groups.Add(listviewgroups[ff.DirectoryName]);
                    }
                    ListViewItem item = new ListViewItem(ff.Name);
                    listviewgroups[ff.DirectoryName].Items.Add(item);
                    item.Checked = true;

                    item.SubItems.Add("" +((int)ff.Length/1024)+" KB");

                  //  item.Group.Header = ff.DirectoryName;
                  //  listviewgroups[ff.DirectoryName].Items.Add(item);
                    listView1.Items.Add(item);
               }
            }
            setguiLock(false);
            ignorechecking = false;
            this.loading.Visible = false;
            updatechecked();
        }

        private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e)
        {
            updatechecked();
        }
        private bool ignorechecking = false;
        private void updatechecked(){
            if (ignorechecking)
                return;
            long size = 0;
            int count = 0;
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (item.Checked)
                {
                    count++;
                    size += Int32.Parse(item.SubItems[1].Text.Split(" ".ToArray())[0]); 
                }
            }
            this.text1.Text = ""+count;
            this.text2.Text = ""+size + " KB";
        }
        private void putHashset(HashSet<string> d, string filename)
        {
            StringBuilder sb = new StringBuilder();
            foreach (string key in d) 
                sb.Append(key).Append("\n");

            File.WriteAllText(filename, sb.ToString());
        }
        private HashSet<string> getHashset(string filename)
        {
            return new HashSet<string>(new Regex("\\n+").Split(File.ReadAllText(filename)));
        }

        private void removefilefromlistview(string fullfilename) {
            foreach (ListViewItem item in this.listView1.Items)
            {
                String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                if (fullfilename.CompareTo(file) == 0)
                {
                    item.Checked = false;
                    this.listView1.Items.Remove(item);
                }
            }
        }
        private void starter(object sender, EventArgs e)
        {
            HashSet<string> filenames = new HashSet<string>();
            StringBuilder data = null;

            setguiLock(true);
            this.time2.Text = "";
            this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);

            foreach (ListViewItem item in this.listView1.Items) {
                if (item.Checked) {
                    String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                    if (File.Exists(file))
                        filenames.Add(file);
                }
            }

            string outputfile = output.Text;
            HashSet<string> words = null;
            if (File.Exists(output.Text))
                words = getHashset(outputfile);
            else
                words = new HashSet<string>();

            int filenamesnr = filenames.Count;
            int filenamesi = 0;
            foreach (String str in filenames){
                progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, filenamesnr);
                Application.DoEvents();
                data = new StringBuilder(System.IO.File.ReadAllText(str, Encoding.UTF7).ToLower());

                data = data.Replace("ä", "ä");
                data = data.Replace("ö", "ö");
                data = data.Replace("ü", "ü");
                data = data.Replace("õ", "õ");

                String sdata = new Regex(@"<(.|\n)*?>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;").Replace(data.ToString(), "");

                foreach (string word in new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+").Split(sdata))
                    if(word.Length>1)
                            words.Add(word);

                removefilefromlistview(str);
            }
            progresslabel.Text = "Progress:";
            putHashset(words, outputfile);

            foreach (ListViewItem item in this.listView1.Items)
                if (item.Checked)
                {
                    item.Checked = false;
                    listView1.Items.Remove(item);
                }

            this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);
            setguiLock(false);
        }

        private void setguiLock(bool value){
            if(value){
                this.Enabled = false;
                this.button1.Enabled = false;
                this.listView1.Enabled = false;
                this.output.Enabled = false;
                this.openoutput.Enabled = false;
                this.progresslabel.Visible = true;
                this.Enabled = true;
            }else{
                this.Enabled = false;
                this.openoutput.Enabled = true;
                this.output.Enabled = true;
                this.listView1.Enabled = true;
                this.button1.Enabled = true;
                this.progresslabel.Visible = false;
                this.Enabled = true;
            }
        }

        private void button2_Click(object sender, EventArgs e)
        {
            if (!File.Exists(output.Text))
                   File.WriteAllText(output.Text, " ");
            System.Diagnostics.Process.Start(output.Text);
        }
    }
}

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(1

阿楠 2024-10-01 02:09:16

您需要为这项工作找到合适的工具。像这样的语言语料库中的数据和标记数量意味着您需要一个适当的 XML 感知索引解决方案。示例包括 eXist、XAIRA、CQP...

You need to get the right tool for the job. The quantity of data and markup in a linguistic corpus like this means you need a proper XML-aware indexing solution. Examples include eXist, XAIRA, CQP...

~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文