如何建立一个单词表

发布于 2024-09-24 02:09:16 字数 7974 浏览 5 评论 0原文

所以现在我想制作爱沙尼亚语单词列表~大约 2000 万个小写的独特单词。要获取单词列表的输入，可以使用爱沙尼亚语料库。语料库文件采用文本编码倡议 (TEI) 格式。我尝试使用正则表达式来查找单词。

这就是我所做的：它效率低下，MVC 结构完全混乱，如果单词的哈希集无法放入内存，它就会崩溃，它不知道输入的编码——所以像 š 这样的字母可能会出现问题，它不会显示预计完成时间，有些控件使用默认名称，有些则没有，它不使用多任务处理（不确定是否应该使用），它使用了一些奇怪的修复手段，并大量锁定界面，以免看起来像是“冻结”了。至少它很短，短到你几乎注意不到代码里没有注释。

优点是，它几乎可以从 .tei、.txt、.csv、.sgml、.xhtml 或任何类似格式的输入中读取单词，而不会出现很多错误。

现在你知道我想做什么，我如何尝试做它（有什么问题），并且我只是想找出如何做它（用最少的体力劳动）。

图片示例：

alt text

代码示例 & Gui：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Data.SqlClient;
using System.IO;
using System.Text.RegularExpressions;

namespace Reader
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and runs <c>InitializeComponent</c>, which is defined
        /// in another part of this partial class.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }


        /// <summary>
        /// Sets the drag effect to <see cref="DragDropEffects.All"/> when the dragged
        /// data is available in the <see cref="DataFormats.FileDrop"/> format.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Drag event data whose <c>Effect</c> may be updated.</param>
        private void listView1_DragEnter(object sender, DragEventArgs e)
        {
            bool carriesFiles = e.Data.GetDataPresent(DataFormats.FileDrop, false);
            if (!carriesFiles)
            {
                // Leave the effect untouched for anything that is not a file drop.
                return;
            }

            e.Effect = DragDropEffects.All;
        }

        /// <summary>
        /// Adds every dropped file to the list view, grouped by its containing directory.
        /// </summary>
        /// <remarks>
        /// Each added item is checked and gets a second column holding the file size
        /// as "N KB". Dropped paths for which <see cref="File.Exists(string)"/> is false
        /// are skipped. The GUI is locked while items are added and is unlocked again
        /// even when adding an item throws.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Drag event data, read in the <see cref="DataFormats.FileDrop"/> format.</param>
        private void listView1_DragDrop(object sender, DragEventArgs e)
        {
            string[] files = e.Data.GetData(DataFormats.FileDrop, false) as string[];
            if (files == null)
            {
                // Not a file drop: there is nothing to add.
                return;
            }

            setguiLock(true);
            this.loading.Visible = true;
            // updatechecked() is a no-op while this flag is set; the totals are
            // refreshed once after all items have been added.
            ignorechecking = true;
            try
            {
                // Seed the lookup with the groups already in the list view so that a
                // directory dropped a second time reuses its group instead of getting
                // a duplicate one.
                Dictionary<string, ListViewGroup> listviewgroups = new Dictionary<string, ListViewGroup>();
                foreach (ListViewGroup existing in listView1.Groups)
                {
                    if (existing.Header != null)
                    {
                        listviewgroups[existing.Header] = existing;
                    }
                }

                int filenamesi = 0;
                foreach (string file in files)
                {
                    progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, files.Length);
                    // Pump the message loop so the progress label repaints.
                    Application.DoEvents();

                    if (!File.Exists(file))
                    {
                        continue;
                    }

                    FileInfo ff = new FileInfo(file);

                    ListViewGroup group;
                    if (!listviewgroups.TryGetValue(ff.DirectoryName, out group))
                    {
                        group = new ListViewGroup(ff.DirectoryName, HorizontalAlignment.Left);
                        listviewgroups.Add(ff.DirectoryName, group);
                        listView1.Groups.Add(group);
                    }

                    ListViewItem item = new ListViewItem(ff.Name);
                    group.Items.Add(item);
                    item.Checked = true;

                    // Divide the long length first: casting Length to int before the
                    // division overflows for files of 2 GiB and larger.
                    item.SubItems.Add("" + (ff.Length / 1024) + " KB");

                    listView1.Items.Add(item);
                }
            }
            finally
            {
                setguiLock(false);
                ignorechecking = false;
                this.loading.Visible = false;
            }

            updatechecked();
        }

        /// <summary>
        /// Handler with the <c>ItemChecked</c> signature; recomputes the checked-file
        /// totals by delegating to <c>updatechecked</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e)
        {
            this.updatechecked();
        }
        // While true, updatechecked() returns immediately without touching the totals.
        private bool ignorechecking = false;

        /// <summary>
        /// Recomputes the number of checked items and the sum of their sizes and
        /// shows both in <c>text1</c> and <c>text2</c>.
        /// </summary>
        /// <remarks>
        /// The size is read from the leading number of each item's second column
        /// (text of the form "N KB"). Items without that column, or whose size text
        /// is not a number, still count as checked but add nothing to the size.
        /// </remarks>
        private void updatechecked()
        {
            if (ignorechecking)
            {
                return;
            }

            long size = 0;
            int count = 0;
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (!item.Checked)
                {
                    continue;
                }

                count++;

                if (item.SubItems.Count < 2)
                {
                    // No size column on this item.
                    continue;
                }

                // Parse as long so a large size cannot overflow, and use TryParse so
                // one malformed cell does not abort the whole recalculation.
                string sizeText = item.SubItems[1].Text.Split(' ')[0];
                long kilobytes;
                if (long.TryParse(sizeText, out kilobytes))
                {
                    size += kilobytes;
                }
            }

            this.text1.Text = "" + count;
            this.text2.Text = "" + size + " KB";
        }
        /// <summary>
        /// Writes every entry of <paramref name="d"/> to <paramref name="filename"/>,
        /// each followed by a single '\n', replacing any existing file.
        /// </summary>
        /// <param name="d">Words to write; the iteration order of the set is used.</param>
        /// <param name="filename">Path of the file to create or overwrite.</param>
        private void putHashset(HashSet<string> d, string filename)
        {
            // Stream the words to disk instead of first concatenating the whole list
            // into one string: peak memory stays at the size of the set itself.
            using (StreamWriter writer = new StreamWriter(filename))
            {
                foreach (string key in d)
                {
                    writer.Write(key);
                    writer.Write('\n');
                }
            }
        }
        /// <summary>
        /// Reads a word list with one word per line from <paramref name="filename"/>.
        /// </summary>
        /// <param name="filename">Path of the word list file to read.</param>
        /// <returns>
        /// The set of distinct non-blank lines. Empty and whitespace-only lines are
        /// skipped, so the trailing newline of the file does not add an empty entry.
        /// </returns>
        private HashSet<string> getHashset(string filename)
        {
            HashSet<string> words = new HashSet<string>();

            // Read line by line rather than loading the whole file as one string;
            // ReadLine also strips "\r\n" endings, so no '\r' is left on a word.
            using (StreamReader reader = new StreamReader(filename))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    if (line.Trim().Length > 0)
                    {
                        words.Add(line);
                    }
                }
            }

            return words;
        }

        /// <summary>
        /// Unchecks and removes every list view item whose reconstructed path
        /// (group header + "\" + item text) equals <paramref name="fullfilename"/>.
        /// </summary>
        /// <param name="fullfilename">Full path of the file whose item should be removed.</param>
        private void removefilefromlistview(string fullfilename)
        {
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (item.Group == null)
                {
                    // Without a group there is no directory to rebuild the path from.
                    continue;
                }

                string file = item.Group.Header + "\\" + item.SubItems[0].Text;

                // Paths are identifiers, not linguistic text: compare them ordinally
                // instead of with the culture-sensitive CompareTo.
                if (string.Equals(fullfilename, file, StringComparison.Ordinal))
                {
                    item.Checked = false;
                    this.listView1.Items.Remove(item);
                }
            }
        }
        /// <summary>
        /// Extracts words from every checked file and merges them into the word list
        /// stored at the path given in the <c>output</c> control.
        /// </summary>
        /// <remarks>
        /// For each checked item whose file exists, the text is lower-cased, markup
        /// and entities are stripped, and every run of two or more letters is added
        /// to the set. The set starts from the existing output file, if there is one,
        /// and is written back once all files are processed. Start and end times are
        /// shown in <c>time1</c> and <c>time2</c>. The GUI is locked for the duration
        /// and is unlocked again even when reading or writing throws.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void starter(object sender, EventArgs e)
        {
            setguiLock(true);
            try
            {
                this.time2.Text = "";
                this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);

                // Collect the distinct, existing files behind the checked items.
                HashSet<string> filenames = new HashSet<string>();
                foreach (ListViewItem item in this.listView1.Items)
                {
                    if (!item.Checked)
                    {
                        continue;
                    }

                    String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                    if (File.Exists(file))
                    {
                        filenames.Add(file);
                    }
                }

                string outputfile = output.Text;
                HashSet<string> words = File.Exists(outputfile)
                    ? getHashset(outputfile)
                    : new HashSet<string>();

                // Build the regexes once instead of once per input file.
                Regex markup = new Regex(@"<(.|\n)*?>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;");
                Regex separators = new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+");

                int filenamesnr = filenames.Count;
                int filenamesi = 0;
                foreach (String str in filenames)
                {
                    progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, filenamesnr);
                    // Pump the message loop so the progress label repaints.
                    Application.DoEvents();

                    // NOTE(review): Encoding.UTF7 is kept from the original code; it is
                    // presumably a workaround for non-UTF-8 input. Confirm the corpus
                    // encoding and pass the real one here.
                    // ToLowerInvariant keeps the result independent of the UI culture.
                    StringBuilder data = new StringBuilder(File.ReadAllText(str, Encoding.UTF7).ToLowerInvariant());

                    // Decode the named entities for Estonian letters first; the generic
                    // entity pattern below would otherwise delete them and split words.
                    data.Replace("&auml;", "ä");
                    data.Replace("&ouml;", "ö");
                    data.Replace("&uuml;", "ü");
                    data.Replace("&otilde;", "õ");
                    data.Replace("&scaron;", "š");
                    data.Replace("&zcaron;", "ž");

                    string sdata = markup.Replace(data.ToString(), "");

                    foreach (string word in separators.Split(sdata))
                    {
                        // Single letters and the empty pieces produced by Split are not words.
                        if (word.Length > 1)
                        {
                            words.Add(word);
                        }
                    }

                    removefilefromlistview(str);
                }

                putHashset(words, outputfile);

                // Drop whatever is still checked (e.g. items whose file no longer exists).
                foreach (ListViewItem item in this.listView1.Items)
                {
                    if (item.Checked)
                    {
                        item.Checked = false;
                        listView1.Items.Remove(item);
                    }
                }

                this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);
            }
            finally
            {
                // Always restore the idle state, even after an I/O failure.
                progresslabel.Text = "Progress:";
                setguiLock(false);
            }
        }

        /// <summary>
        /// Locks (<paramref name="value"/> = true) or unlocks (false) the input
        /// controls and shows or hides the progress label accordingly.
        /// </summary>
        /// <param name="value">True to disable the controls, false to enable them.</param>
        private void setguiLock(bool value)
        {
            // In both directions the form itself is disabled while the individual
            // controls are toggled and enabled again afterwards.
            this.Enabled = false;

            if (!value)
            {
                this.openoutput.Enabled = true;
                this.output.Enabled = true;
                this.listView1.Enabled = true;
                this.button1.Enabled = true;
                this.progresslabel.Visible = false;
            }
            else
            {
                this.button1.Enabled = false;
                this.listView1.Enabled = false;
                this.output.Enabled = false;
                this.openoutput.Enabled = false;
                this.progresslabel.Visible = true;
            }

            this.Enabled = true;
        }

        /// <summary>
        /// Opens the output file with its associated application, creating the file
        /// first when it does not exist yet.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            string path = output.Text;
            if (string.IsNullOrEmpty(path) || path.Trim().Length == 0)
            {
                // No output path entered: File.WriteAllText and Process.Start would
                // both throw on an empty path, so there is nothing to open.
                return;
            }

            if (!File.Exists(path))
            {
                File.WriteAllText(path, " ");
            }

            System.Diagnostics.Process.Start(path);
        }
    }
}

原文

So now I want to make an Estonian word list of about 20 million unique lowercase words. To get input for the word list, a corpus of Estonian can be used. The corpus files are in Text Encoding Initiative (TEI) format. I tried using regex to find the words.

This is what I made: it's inefficient, the MVC structure is all messed up, it breaks if the hash set of words can't fit in memory, it's not aware of the input's encoding - so letters like š probably cause problems, it does not show an estimated completion time, some controls have default names and some don't, it does not use multitasking (not sure if it should), and it uses some weird fixes and lots of interface locking so that it does not appear 'frozen'. At least it's so short that you hardly notice there are no comments.

The upside is that it can read words, almost without mistakes, from .tei, .txt, .csv, .sgml, .xhtml or any similar input format.

Now you know what I want to do, how I have tried doing it (with what problems), and again I'm just trying to find out how to do it (with minimal manual labor).

Image example:

alt text

Code example & Gui:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Data.SqlClient;
using System.IO;
using System.Text.RegularExpressions;

namespace Reader
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and runs <c>InitializeComponent</c>, which is defined
        /// in another part of this partial class.
        /// </summary>
        public Form1()
        {
            this.InitializeComponent();
        }


        /// <summary>
        /// Sets the drag effect to <see cref="DragDropEffects.All"/> when the dragged
        /// data is available in the <see cref="DataFormats.FileDrop"/> format.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Drag event data whose <c>Effect</c> may be updated.</param>
        private void listView1_DragEnter(object sender, DragEventArgs e)
        {
            bool carriesFiles = e.Data.GetDataPresent(DataFormats.FileDrop, false);
            if (!carriesFiles)
            {
                // Leave the effect untouched for anything that is not a file drop.
                return;
            }

            e.Effect = DragDropEffects.All;
        }

        /// <summary>
        /// Adds every dropped file to the list view, grouped by its containing directory.
        /// </summary>
        /// <remarks>
        /// Each added item is checked and gets a second column holding the file size
        /// as "N KB". Dropped paths for which <see cref="File.Exists(string)"/> is false
        /// are skipped. The GUI is locked while items are added and is unlocked again
        /// even when adding an item throws.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Drag event data, read in the <see cref="DataFormats.FileDrop"/> format.</param>
        private void listView1_DragDrop(object sender, DragEventArgs e)
        {
            string[] files = e.Data.GetData(DataFormats.FileDrop, false) as string[];
            if (files == null)
            {
                // Not a file drop: there is nothing to add.
                return;
            }

            setguiLock(true);
            this.loading.Visible = true;
            // updatechecked() is a no-op while this flag is set; the totals are
            // refreshed once after all items have been added.
            ignorechecking = true;
            try
            {
                // Seed the lookup with the groups already in the list view so that a
                // directory dropped a second time reuses its group instead of getting
                // a duplicate one.
                Dictionary<string, ListViewGroup> listviewgroups = new Dictionary<string, ListViewGroup>();
                foreach (ListViewGroup existing in listView1.Groups)
                {
                    if (existing.Header != null)
                    {
                        listviewgroups[existing.Header] = existing;
                    }
                }

                int filenamesi = 0;
                foreach (string file in files)
                {
                    progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, files.Length);
                    // Pump the message loop so the progress label repaints.
                    Application.DoEvents();

                    if (!File.Exists(file))
                    {
                        continue;
                    }

                    FileInfo ff = new FileInfo(file);

                    ListViewGroup group;
                    if (!listviewgroups.TryGetValue(ff.DirectoryName, out group))
                    {
                        group = new ListViewGroup(ff.DirectoryName, HorizontalAlignment.Left);
                        listviewgroups.Add(ff.DirectoryName, group);
                        listView1.Groups.Add(group);
                    }

                    ListViewItem item = new ListViewItem(ff.Name);
                    group.Items.Add(item);
                    item.Checked = true;

                    // Divide the long length first: casting Length to int before the
                    // division overflows for files of 2 GiB and larger.
                    item.SubItems.Add("" + (ff.Length / 1024) + " KB");

                    listView1.Items.Add(item);
                }
            }
            finally
            {
                setguiLock(false);
                ignorechecking = false;
                this.loading.Visible = false;
            }

            updatechecked();
        }

        /// <summary>
        /// Handler with the <c>ItemChecked</c> signature; recomputes the checked-file
        /// totals by delegating to <c>updatechecked</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void listView1_ItemChecked(object sender, ItemCheckedEventArgs e)
        {
            this.updatechecked();
        }
        // While true, updatechecked() returns immediately without touching the totals.
        private bool ignorechecking = false;

        /// <summary>
        /// Recomputes the number of checked items and the sum of their sizes and
        /// shows both in <c>text1</c> and <c>text2</c>.
        /// </summary>
        /// <remarks>
        /// The size is read from the leading number of each item's second column
        /// (text of the form "N KB"). Items without that column, or whose size text
        /// is not a number, still count as checked but add nothing to the size.
        /// </remarks>
        private void updatechecked()
        {
            if (ignorechecking)
            {
                return;
            }

            long size = 0;
            int count = 0;
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (!item.Checked)
                {
                    continue;
                }

                count++;

                if (item.SubItems.Count < 2)
                {
                    // No size column on this item.
                    continue;
                }

                // Parse as long so a large size cannot overflow, and use TryParse so
                // one malformed cell does not abort the whole recalculation.
                string sizeText = item.SubItems[1].Text.Split(' ')[0];
                long kilobytes;
                if (long.TryParse(sizeText, out kilobytes))
                {
                    size += kilobytes;
                }
            }

            this.text1.Text = "" + count;
            this.text2.Text = "" + size + " KB";
        }
        /// <summary>
        /// Writes every entry of <paramref name="d"/> to <paramref name="filename"/>,
        /// each followed by a single '\n', replacing any existing file.
        /// </summary>
        /// <param name="d">Words to write; the iteration order of the set is used.</param>
        /// <param name="filename">Path of the file to create or overwrite.</param>
        private void putHashset(HashSet<string> d, string filename)
        {
            // Stream the words to disk instead of first concatenating the whole list
            // into one string: peak memory stays at the size of the set itself.
            using (StreamWriter writer = new StreamWriter(filename))
            {
                foreach (string key in d)
                {
                    writer.Write(key);
                    writer.Write('\n');
                }
            }
        }
        /// <summary>
        /// Reads a word list with one word per line from <paramref name="filename"/>.
        /// </summary>
        /// <param name="filename">Path of the word list file to read.</param>
        /// <returns>
        /// The set of distinct non-blank lines. Empty and whitespace-only lines are
        /// skipped, so the trailing newline of the file does not add an empty entry.
        /// </returns>
        private HashSet<string> getHashset(string filename)
        {
            HashSet<string> words = new HashSet<string>();

            // Read line by line rather than loading the whole file as one string;
            // ReadLine also strips "\r\n" endings, so no '\r' is left on a word.
            using (StreamReader reader = new StreamReader(filename))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    if (line.Trim().Length > 0)
                    {
                        words.Add(line);
                    }
                }
            }

            return words;
        }

        /// <summary>
        /// Unchecks and removes every list view item whose reconstructed path
        /// (group header + "\" + item text) equals <paramref name="fullfilename"/>.
        /// </summary>
        /// <param name="fullfilename">Full path of the file whose item should be removed.</param>
        private void removefilefromlistview(string fullfilename)
        {
            foreach (ListViewItem item in this.listView1.Items)
            {
                if (item.Group == null)
                {
                    // Without a group there is no directory to rebuild the path from.
                    continue;
                }

                string file = item.Group.Header + "\\" + item.SubItems[0].Text;

                // Paths are identifiers, not linguistic text: compare them ordinally
                // instead of with the culture-sensitive CompareTo.
                if (string.Equals(fullfilename, file, StringComparison.Ordinal))
                {
                    item.Checked = false;
                    this.listView1.Items.Remove(item);
                }
            }
        }
        /// <summary>
        /// Extracts words from every checked file and merges them into the word list
        /// stored at the path given in the <c>output</c> control.
        /// </summary>
        /// <remarks>
        /// For each checked item whose file exists, the text is lower-cased, markup
        /// and entities are stripped, and every run of two or more letters is added
        /// to the set. The set starts from the existing output file, if there is one,
        /// and is written back once all files are processed. Start and end times are
        /// shown in <c>time1</c> and <c>time2</c>. The GUI is locked for the duration
        /// and is unlocked again even when reading or writing throws.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void starter(object sender, EventArgs e)
        {
            setguiLock(true);
            try
            {
                this.time2.Text = "";
                this.time1.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);

                // Collect the distinct, existing files behind the checked items.
                HashSet<string> filenames = new HashSet<string>();
                foreach (ListViewItem item in this.listView1.Items)
                {
                    if (!item.Checked)
                    {
                        continue;
                    }

                    String file = item.Group.Header + "\\" + item.SubItems[0].Text;
                    if (File.Exists(file))
                    {
                        filenames.Add(file);
                    }
                }

                string outputfile = output.Text;
                HashSet<string> words = File.Exists(outputfile)
                    ? getHashset(outputfile)
                    : new HashSet<string>();

                // Build the regexes once instead of once per input file.
                Regex markup = new Regex(@"<(.|\n)*?>|%[a-zA-Z0-9]+?;|&[#a-zA-Z0-9]+?;");
                Regex separators = new Regex("[^A-Za-zšžõäöüŠŽÕÄÖÜ]+");

                int filenamesnr = filenames.Count;
                int filenamesi = 0;
                foreach (String str in filenames)
                {
                    progresslabel.Text = string.Format("Progress: \t[ {0} / {1} ]", filenamesi++, filenamesnr);
                    // Pump the message loop so the progress label repaints.
                    Application.DoEvents();

                    // NOTE(review): Encoding.UTF7 is kept from the original code; it is
                    // presumably a workaround for non-UTF-8 input. Confirm the corpus
                    // encoding and pass the real one here.
                    // ToLowerInvariant keeps the result independent of the UI culture.
                    StringBuilder data = new StringBuilder(File.ReadAllText(str, Encoding.UTF7).ToLowerInvariant());

                    // Decode the named entities for Estonian letters first; the generic
                    // entity pattern below would otherwise delete them and split words.
                    // The search strings must be the entity names: replacing a letter
                    // with itself does nothing.
                    data.Replace("&auml;", "ä");
                    data.Replace("&ouml;", "ö");
                    data.Replace("&uuml;", "ü");
                    data.Replace("&otilde;", "õ");
                    data.Replace("&scaron;", "š");
                    data.Replace("&zcaron;", "ž");

                    string sdata = markup.Replace(data.ToString(), "");

                    foreach (string word in separators.Split(sdata))
                    {
                        // Single letters and the empty pieces produced by Split are not words.
                        if (word.Length > 1)
                        {
                            words.Add(word);
                        }
                    }

                    removefilefromlistview(str);
                }

                putHashset(words, outputfile);

                // Drop whatever is still checked (e.g. items whose file no longer exists).
                foreach (ListViewItem item in this.listView1.Items)
                {
                    if (item.Checked)
                    {
                        item.Checked = false;
                        listView1.Items.Remove(item);
                    }
                }

                this.time2.Text = String.Format("{0:d/M/yyyy HH:mm:ss}", DateTime.Now);
            }
            finally
            {
                // Always restore the idle state, even after an I/O failure.
                progresslabel.Text = "Progress:";
                setguiLock(false);
            }
        }

        /// <summary>
        /// Locks (<paramref name="value"/> = true) or unlocks (false) the input
        /// controls and shows or hides the progress label accordingly.
        /// </summary>
        /// <param name="value">True to disable the controls, false to enable them.</param>
        private void setguiLock(bool value)
        {
            // In both directions the form itself is disabled while the individual
            // controls are toggled and enabled again afterwards.
            this.Enabled = false;

            if (!value)
            {
                this.openoutput.Enabled = true;
                this.output.Enabled = true;
                this.listView1.Enabled = true;
                this.button1.Enabled = true;
                this.progresslabel.Visible = false;
            }
            else
            {
                this.button1.Enabled = false;
                this.listView1.Enabled = false;
                this.output.Enabled = false;
                this.openoutput.Enabled = false;
                this.progresslabel.Visible = true;
            }

            this.Enabled = true;
        }

        /// <summary>
        /// Opens the output file with its associated application, creating the file
        /// first when it does not exist yet.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void button2_Click(object sender, EventArgs e)
        {
            string path = output.Text;
            if (string.IsNullOrEmpty(path) || path.Trim().Length == 0)
            {
                // No output path entered: File.WriteAllText and Process.Start would
                // both throw on an empty path, so there is nothing to open.
                return;
            }

            if (!File.Exists(path))
            {
                File.WriteAllText(path, " ");
            }

            System.Diagnostics.Process.Start(path);
        }
    }
}

分享到QQ

分享到微博