SQL - 确定列中最常出现的单词

发布于 2024-12-19 03:01:16 字数 166 浏览 4 评论 0原文

是否有一种简单的方法可以使用 T-SQL 或 VBA 确定列/字段中最常出现的单词?

我正在为两个给定的记录集开发一个模糊匹配系统,并希望生成一个匹配字符串,其中删除最常出现的单词。由于数据来自客户关系管理数据库,因此将删除“limited”、“ltd”、“plc”和“CORPORATION”等术语。

Is there an easy way of determining the most frequently occuring word in a column/field using T-SQL or VBA?

I am working on a fuzzy matching system for two given recordsets and would like to produce a matching string where the most frequently occuring words are removed. As the data is from a customer relations management database terms like "limited", "ltd", "plc" and "CORPORATION" would be removed.

如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

扫码二维码加入Web技术交流群

发布评论

需要 登录 才能够评论, 你可以免费 注册 一个本站的账号。

评论(2

放肆 2024-12-26 03:01:16

为 sql-server 2005+ 编写

分割函数:

create function f_split
(
  @a varchar(max), 
  @delimiter varchar(20)
)
RETURNS @t TABLE(substr varchar(200))
as
begin
set @a = @a + @delimiter
;with a as
(
  select cast(1 as bigint) f1, charindex(@delimiter, @a) f2
  where len(@a) > 0
  union all
  select f2 + (len(@delimiter)) + 1, charindex(@delimiter, @a, f2+1)
  from a
  where f2 > 0
)
insert @t
select substring(@a, f1, f2 - f1) from a
where f1 < f2
return
end
go

查询:

--testdata
declare @table table(name varchar(50))

insert @table values('bla bla bla ltd')
insert @table values('bla plc ltd')
insert @table values('more text CORPORATION')


declare @matchlist table(name varchar(50), replacement varchar(50))
insert @matchlist values('ltd', 'limited')
insert @matchlist values('plc', 'limited')
insert @matchlist values('CORPORATION', 'limited')

--query
select coalesce(m.replacement, a.substr) name, count(*) count from @table p
cross apply
(
  select substr from 
  dbo.f_split(p.name, ' ')
) a
left join
@matchlist m
on a.substr = m.name
group by coalesce(m.replacement, a.substr)
order by 2 desc

结果:

name  count
----  -----
bla       4
limited   4
more      1
text      1

Written for sql-server 2005+

Function to split:

create function f_split
(
  @a varchar(max), 
  @delimiter varchar(20)
)
RETURNS @t TABLE(substr varchar(200))
as
begin
set @a = @a + @delimiter
;with a as
(
  select cast(1 as bigint) f1, charindex(@delimiter, @a) f2
  where len(@a) > 0
  union all
  select f2 + (len(@delimiter)) + 1, charindex(@delimiter, @a, f2+1)
  from a
  where f2 > 0
)
insert @t
select substring(@a, f1, f2 - f1) from a
where f1 < f2
return
end
go

Query:

--testdata
declare @table table(name varchar(50))

insert @table values('bla bla bla ltd')
insert @table values('bla plc ltd')
insert @table values('more text CORPORATION')


declare @matchlist table(name varchar(50), replacement varchar(50))
insert @matchlist values('ltd', 'limited')
insert @matchlist values('plc', 'limited')
insert @matchlist values('CORPORATION', 'limited')

--query
select coalesce(m.replacement, a.substr) name, count(*) count from @table p
cross apply
(
  select substr from 
  dbo.f_split(p.name, ' ')
) a
left join
@matchlist m
on a.substr = m.name
group by coalesce(m.replacement, a.substr)
order by 2 desc

Result:

name  count
----  -----
bla       4
limited   4
more      1
text      1
顾冷 2024-12-26 03:01:16

希望这对您有用。

   create table sometable
    ( id integer not null primary key identity
    , mYWords text not null
    );
    insert into sometable (mYWords) 
    values ('a word that appears maximum number of times in a column')
    insert into sometable (mYWords) 
    values ('Is it possible to get words from text columns in a sql server database')
    insert into sometable (mYWords) 
    values ('This could solve my problem if reffered column contain only single word')
    insert into sometable (mYWords) 
    values ('that''s going to require that you split out every word in the column individually')
    insert into sometable (mYWords) 
    values ('the query will definitely not be easy to write')
    insert into sometable (mYWords) 
    values ('Please read the sticky at the top of the board')
    insert into sometable (mYWords) 
    values ('The physical order of data in a database has no meaning')

GO

CREATE TABLE WordList (
      Word varchar(256)
    , WordId int IDENTITY(1,1)
    , Add_Dt datetime DEFAULT (GetDate()))
 GO

CREATE UNIQUE INDEX UnqueWords_PK ON WordList(Word)
GO

CREATE PROC isp_INS_WORD_LIST
AS
BEGIN
    SET NOCOUNT ON
    DECLARE @Words INT, @Pos INT, @x Int, @str varchar(256)
          , @word varchar(256), @start int, @end int, @exitstart int
    SELECT @Words = 0, @Pos = 1, @x = -1, @Word = '', @start = 1

    DECLARE Cur1 CURSOR FOR SELECT mYWords FROM sometable
    OPEN Cur1
    FETCH NEXT FROM Cur1 INTO @str

    WHILE @@FETCH_STATUS = 0
      BEGIN
        WHILE (@x <> 0)
            BEGIN
                SET @x     = CHARINDEX(' ', @str, @Pos)
                IF @x <> 0
                  BEGIN 
                    SET @end   = @x - @start
                    SET @word  = SUBSTRING(@str,@start,@end)
                    IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @Word)
                        INSERT INTO WordList(Word) SELECT @word
                    -- SELECT @Word, @@ROWCOUNT,@@ERROR
                    -- SELECT @x, @Word, @start, @end, @str
                    SET @exitstart = @start + @end + 1
                    SET @Pos   = @x + 1
                    SET @start = @x + 1
                    SET @Words = @Words + 1
                  END
                IF @x = 0
                  BEGIN
                    SET @word  = SUBSTRING(@str,@exitstart,LEN(@str)-@exitstart+1)
                    IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @Word)
                        INSERT INTO WordList(Word) SELECT @word
                    -- SELECT @Word, @@ROWCOUNT,@@ERROR
                    -- SELECT @x, @Word, @exitstart, LEN(@str)-@exitstart, @str
                  END
            END
        FETCH NEXT FROM Cur1 INTO @str
        SELECT @Words = 0, @Pos = 1, @x = -1, @Word = '', @start = 1
      END   

      CLOSE Cur1
      DEALLOCATE Cur1
      SET NOCOUNT OFF
    RETURN @Words
END
GO

EXEC isp_INS_WORD_LIST
GO

SELECT * FROM WordList ORDER BY Word
GO

DROP PROC isp_INS_WORD_LIST
DROP TABLE WordList, sometable
GO

Hope this will be useful to you.

   create table sometable
    ( id integer not null primary key identity
    , mYWords text not null
    );
    insert into sometable (mYWords) 
    values ('a word that appears maximum number of times in a column')
    insert into sometable (mYWords) 
    values ('Is it possible to get words from text columns in a sql server database')
    insert into sometable (mYWords) 
    values ('This could solve my problem if reffered column contain only single word')
    insert into sometable (mYWords) 
    values ('that''s going to require that you split out every word in the column individually')
    insert into sometable (mYWords) 
    values ('the query will definitely not be easy to write')
    insert into sometable (mYWords) 
    values ('Please read the sticky at the top of the board')
    insert into sometable (mYWords) 
    values ('The physical order of data in a database has no meaning')

GO

CREATE TABLE WordList (
      Word varchar(256)
    , WordId int IDENTITY(1,1)
    , Add_Dt datetime DEFAULT (GetDate()))
 GO

CREATE UNIQUE INDEX UnqueWords_PK ON WordList(Word)
GO

CREATE PROC isp_INS_WORD_LIST
AS
BEGIN
    SET NOCOUNT ON
    DECLARE @Words INT, @Pos INT, @x Int, @str varchar(256)
          , @word varchar(256), @start int, @end int, @exitstart int
    SELECT @Words = 0, @Pos = 1, @x = -1, @Word = '', @start = 1

    DECLARE Cur1 CURSOR FOR SELECT mYWords FROM sometable
    OPEN Cur1
    FETCH NEXT FROM Cur1 INTO @str

    WHILE @@FETCH_STATUS = 0
      BEGIN
        WHILE (@x <> 0)
            BEGIN
                SET @x     = CHARINDEX(' ', @str, @Pos)
                IF @x <> 0
                  BEGIN 
                    SET @end   = @x - @start
                    SET @word  = SUBSTRING(@str,@start,@end)
                    IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @Word)
                        INSERT INTO WordList(Word) SELECT @word
                    -- SELECT @Word, @@ROWCOUNT,@@ERROR
                    -- SELECT @x, @Word, @start, @end, @str
                    SET @exitstart = @start + @end + 1
                    SET @Pos   = @x + 1
                    SET @start = @x + 1
                    SET @Words = @Words + 1
                  END
                IF @x = 0
                  BEGIN
                    SET @word  = SUBSTRING(@str,@exitstart,LEN(@str)-@exitstart+1)
                    IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @Word)
                        INSERT INTO WordList(Word) SELECT @word
                    -- SELECT @Word, @@ROWCOUNT,@@ERROR
                    -- SELECT @x, @Word, @exitstart, LEN(@str)-@exitstart, @str
                  END
            END
        FETCH NEXT FROM Cur1 INTO @str
        SELECT @Words = 0, @Pos = 1, @x = -1, @Word = '', @start = 1
      END   

      CLOSE Cur1
      DEALLOCATE Cur1
      SET NOCOUNT OFF
    RETURN @Words
END
GO

EXEC isp_INS_WORD_LIST
GO

SELECT * FROM WordList ORDER BY Word
GO

DROP PROC isp_INS_WORD_LIST
DROP TABLE WordList, sometable
GO
~没有更多了~
我们使用 Cookies 和其他技术来定制您的体验包括您的登录状态等。通过阅读我们的 隐私政策 了解更多相关信息。 单击 接受 或继续使用网站,即表示您同意使用 Cookies 和您的相关数据。
原文