2015-04-02 84 views
0

我在 SQL Server 中有 1000 个表,每个表都是从一个 CSV 文件创建的。各表中的数据类似,每个表代表不同的一天。我想在数据库中找出结构重复(相同)的表。

我遇到的问题是表格的结构以及列的名称有很多变化。

但是有些表格确实有匹配的结构,我认为合并数据的一个好的起点是将这些数据合并在一起。

我一直在寻找一种查询数据库的方法,以便找出这些结构相同的表,但尚未成功。

任何帮助将不胜感激。

+0

通常的做法是先把数据导入一个临时(暂存)表,然后再复制/移动到数据库中的正式表。 – 2015-04-02 21:15:08

+0

我最初也打算这样做。但当我发现每个文件都有超过 400 列时,我改变了主意,先把它们全部导入数据库,同时再考虑接下来该怎么处理。 – 2015-04-02 21:18:28

回答

1

如果这些表的结构确实完全相同,可以试试下面的脚本。它实际上会为你生成 INSERT 语句,并且在需要时还可以删除旧表。

-- Demo fixture: (re)creates five sample tables so the duplicate-structure
-- query can be tried out. table1/table2/table5 share one structure,
-- table3/table4 share another.
-- The 'U' argument limits OBJECT_ID to user tables, so DROP TABLE is only
-- attempted when the existing object really is a table.
IF OBJECT_ID('dbo.table1', 'U') IS NOT NULL DROP TABLE dbo.table1;
IF OBJECT_ID('dbo.table2', 'U') IS NOT NULL DROP TABLE dbo.table2;
IF OBJECT_ID('dbo.table3', 'U') IS NOT NULL DROP TABLE dbo.table3;
IF OBJECT_ID('dbo.table4', 'U') IS NOT NULL DROP TABLE dbo.table4;
IF OBJECT_ID('dbo.table5', 'U') IS NOT NULL DROP TABLE dbo.table5;

-- The tables are created explicitly in dbo so that they are the same objects
-- the checks above look for, whatever the caller's default schema is.
CREATE TABLE dbo.table1 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME,AvgScore NUMERIC(18,6)); --table1
CREATE TABLE dbo.table2 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME,AvgScore NUMERIC(18,6)); --matches table1
CREATE TABLE dbo.table3 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME); --table3
CREATE TABLE dbo.table4 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME); --matches table3
CREATE TABLE dbo.table5 (ID INT,FirstName VARCHAR(25),LastName NVARCHAR(25),EntryDate DATETIME,AvgScore NUMERIC(18,6)); --matches table1




-- Generates one "INSERT INTO <keeper> SELECT * FROM <duplicate>; DROP TABLE <duplicate>;"
-- statement for every table whose structure is identical to that of an
-- earlier table (earlier = lower schema/table name).
-- The statements are only generated, never executed; review them before running.
WITH CTE_table_columns
AS
(
    -- One row per column of every base table. Views also appear in
    -- INFORMATION_SCHEMA.COLUMNS, but they must not receive a DROP TABLE,
    -- so they are filtered out here.
    SELECT
        C.TABLE_SCHEMA,
        C.TABLE_NAME,
        C.ORDINAL_POSITION,
        C.COLUMN_NAME,
        C.DATA_TYPE,
        C.IS_NULLABLE,
        C.CHARACTER_MAXIMUM_LENGTH,
        C.NUMERIC_PRECISION,
        C.NUMERIC_SCALE,
        C.DATETIME_PRECISION,
        -- Counted per schema + table so that same-named tables in different
        -- schemas are not mixed up. COUNT(*) is used so the total is the real
        -- number of columns even if ordinals are not contiguous.
        COUNT(*) OVER (PARTITION BY C.TABLE_SCHEMA, C.TABLE_NAME) AS total_columns
    FROM INFORMATION_SCHEMA.COLUMNS AS C
    INNER JOIN INFORMATION_SCHEMA.TABLES AS T
        ON  T.TABLE_SCHEMA = C.TABLE_SCHEMA
        AND T.TABLE_NAME = C.TABLE_NAME
    WHERE T.TABLE_TYPE = 'BASE TABLE'
),
CTE_matching_Tables
AS
(
    -- Every pair (primary, matched) of tables in which ALL columns agree on
    -- position, name, type, nullability, length, precision and scale.
    SELECT
        A.TABLE_SCHEMA AS primarySchema,
        A.TABLE_NAME AS primaryTable,
        A.total_columns,
        COUNT(*) AS matching_columns,
        B.TABLE_SCHEMA AS matchedSchema,
        B.TABLE_NAME AS matchedTable
    FROM CTE_table_columns AS A
    INNER JOIN CTE_table_columns AS B
        -- Each unordered pair is listed once: the primary table sorts first.
        ON  (   A.TABLE_SCHEMA < B.TABLE_SCHEMA
             OR (A.TABLE_SCHEMA = B.TABLE_SCHEMA AND A.TABLE_NAME < B.TABLE_NAME)
            )
        AND A.ORDINAL_POSITION = B.ORDINAL_POSITION
        AND A.total_columns = B.total_columns
        AND A.COLUMN_NAME = B.COLUMN_NAME
        AND A.DATA_TYPE = B.DATA_TYPE
        AND A.IS_NULLABLE = B.IS_NULLABLE
        -- The attributes below are NULL when they do not apply to the data
        -- type, so "both NULL" has to count as equal.
        AND (   A.CHARACTER_MAXIMUM_LENGTH = B.CHARACTER_MAXIMUM_LENGTH
             OR (A.CHARACTER_MAXIMUM_LENGTH IS NULL AND B.CHARACTER_MAXIMUM_LENGTH IS NULL)
            )
        AND (   A.NUMERIC_PRECISION = B.NUMERIC_PRECISION
             OR (A.NUMERIC_PRECISION IS NULL AND B.NUMERIC_PRECISION IS NULL)
            )
        AND (   A.NUMERIC_SCALE = B.NUMERIC_SCALE
             OR (A.NUMERIC_SCALE IS NULL AND B.NUMERIC_SCALE IS NULL)
            )
        AND (   A.DATETIME_PRECISION = B.DATETIME_PRECISION
             OR (A.DATETIME_PRECISION IS NULL AND B.DATETIME_PRECISION IS NULL)
            )
    GROUP BY A.TABLE_SCHEMA, A.TABLE_NAME, A.total_columns, B.TABLE_SCHEMA, B.TABLE_NAME
    -- Keep the pair only when every single column matched.
    HAVING A.total_columns = COUNT(*)
),
CTE_ranked_matches
AS
(
    -- CTE_matching_Tables has all table matches. For each matched table, rank
    -- its candidate primary tables so the lowest one can be picked. That way
    -- table2 and table5 both insert into table1, even though table2 and
    -- table5 also match each other.
    SELECT
        primarySchema,
        primaryTable,
        matchedSchema,
        matchedTable,
        ROW_NUMBER() OVER (
            PARTITION BY matchedSchema, matchedTable
            ORDER BY primarySchema, primaryTable
        ) AS match_rank
    FROM CTE_matching_Tables
)

-- Names are schema-qualified and wrapped with QUOTENAME so the generated
-- statements stay valid for non-dbo schemas and for names containing spaces
-- or other special characters, e.g.
--   INSERT INTO [dbo].[table1] SELECT * FROM [dbo].[table2]; DROP TABLE [dbo].[table2];
-- NOTE(review): the generated INSERT relies on SELECT *; tables with IDENTITY
-- or computed columns would need an explicit column list - verify before running.
SELECT 'INSERT INTO ' + QUOTENAME(primarySchema) + '.' + QUOTENAME(primaryTable)
     + ' SELECT * FROM ' + QUOTENAME(matchedSchema) + '.' + QUOTENAME(matchedTable)
     + '; DROP TABLE ' + QUOTENAME(matchedSchema) + '.' + QUOTENAME(matchedTable) + ';'
FROM CTE_ranked_matches
WHERE match_rank = 1
ORDER BY matchedSchema, matchedTable;

结果:

INSERT INTO table1 SELECT * FROM table2; DROP TABLE table2; 
INSERT INTO table3 SELECT * FROM table4; DROP TABLE table4; 
INSERT INTO table1 SELECT * FROM table5; DROP TABLE table5; 
+0

这很好,谢谢。然而,我必须修改它以包含一个检查,即按名称匹配的列数等于列总数。 – 2015-04-04 20:51:00

+0

哦,很好。我删除了该行,并意外忘记重新添加它。 – Stephan 2015-04-04 21:01:24

1

以下代码查找列数完全相同且列类型相匹配的表。请注意,列的顺序并不重要。例如,如果您有两个这样的表:

Table01 
Column01 INT 
Column02 BIT 

Table02 
Column01 BIT 
Column02 INT 

由于具有相同的结构,因此将会进行匹配。


下面的代码很简单:我们为每个表生成一个由其各列类型 ID 组成的 CSV 列表。

-- Table variable with one row per user table:
--   [name]  - the table name as stored in sys.tables
--   [value] - comma-separated list of the table's column system_type_id
--             values, sorted ascending (so column order does not matter)
DECLARE @DataSource TABLE
(
    [name]  SYSNAME,
    [value] VARCHAR(MAX)
);

INSERT INTO @DataSource ([name], [value])
SELECT
    tbl.[name],
    -- FOR XML PATH('') concatenates the ",<type id>" fragments into a single
    -- string; STUFF then removes the leading comma.
    STUFF(
        (
            SELECT ',' + CAST(col.[system_type_id] AS VARCHAR(12))
            FROM [sys].[columns] AS col
            WHERE col.[object_id] = tbl.[object_id]
            ORDER BY col.[system_type_id]
            FOR XML PATH(''), TYPE
        ).value('.', 'VARCHAR(MAX)'),
        1,
        1,
        ''
    )
FROM [sys].[tables] AS tbl;

从该表变量中查询得到的结果如下所示:

enter image description here

现在我们做同样的事情,但这次按列类型的 CSV 列表进行分组:

-- Returns one row per distinct column-type list held in @DataSource:
--   1st column: the CSV list of column type ids
--   2nd column: the CSV list of the names of all tables having that list
SELECT DS.[value]
     ,NamesCSV.[value]
FROM
(
    -- De-duplicate first; otherwise the same group would be repeated once
    -- for every table that belongs to it.
    SELECT DISTINCT [value]
    FROM @DataSource
) DS
CROSS APPLY
(
    SELECT STUFF
    (
     (
      -- [name] is SYSNAME (Unicode), so the list is built as NVARCHAR to
      -- keep non-ASCII table names intact.
      SELECT N',' + D.[name]
      FROM @DataSource D
      WHERE DS.[value] = D.[value]
      ORDER BY D.[name]
      FOR XML PATH(''), TYPE
     ).value('.', 'NVARCHAR(MAX)')
     ,1
     ,1
     ,N''
    )
) NamesCSV ([value]);

我在 AdventureWorks2012 数据库中测试了这段代码,它确实找到了结构相匹配的表:

enter image description here

当然,这只是一个起点,你还可以检查其他内容。例如,可以在每个列类型 ID 后面附加该列是 NULL 还是 NOT NULL 的标记,如下所示:

TYPEID|NOTNULL,TYPEID|NULL... 
1

你可以在信息架构视图 INFORMATION_SCHEMA.COLUMNS 中找到大量有用的信息。

这会给你(除其他外)表名,列顺序,列名和列定义。

因此,举例来说,你可以做这样的事情:

-- Lists every pair of base tables whose columns agree, position by position,
-- on data type and character length. Column names are not compared.
-- Catalog identifiers are written in their documented upper-case form so the
-- query also runs on databases with a case-sensitive collation.
;WITH A AS (
    -- Create a list of table pairs (each unordered pair once). If you have
    -- reason to believe that some tables are more likely to be similar than
    -- others, you can modify this CTE as you need to.
    -- Views are excluded, and the schema is carried along so that same-named
    -- tables in different schemas are kept apart.
    SELECT
        t1.TABLE_NAME   AS table_name,
        t2.TABLE_NAME   AS other_table_Name,
        t1.TABLE_SCHEMA AS table_schema,
        t2.TABLE_SCHEMA AS other_table_schema
    FROM INFORMATION_SCHEMA.TABLES AS t1
    INNER JOIN INFORMATION_SCHEMA.TABLES AS t2
        ON  t1.TABLE_SCHEMA < t2.TABLE_SCHEMA
        OR (t1.TABLE_SCHEMA = t2.TABLE_SCHEMA AND t1.TABLE_NAME < t2.TABLE_NAME)
    WHERE t1.TABLE_TYPE = 'BASE TABLE'
      AND t2.TABLE_TYPE = 'BASE TABLE'
)
-- Pick all the pairs of table names ...
SELECT
    A.table_name,
    A.other_table_Name,
    A.table_schema,
    A.other_table_schema
FROM A
WHERE NOT EXISTS (
        -- ... where the first table does NOT have any columns ...
        SELECT 1
        FROM INFORMATION_SCHEMA.COLUMNS AS c1
        WHERE c1.TABLE_SCHEMA = A.table_schema
          AND c1.TABLE_NAME = A.table_name
          AND NOT EXISTS (
                -- ... that are NOT found in the second table ...
                SELECT 1
                FROM INFORMATION_SCHEMA.COLUMNS AS c2
                WHERE c2.TABLE_SCHEMA = A.other_table_schema
                  AND c2.TABLE_NAME = A.other_table_Name
                  AND c2.ORDINAL_POSITION = c1.ORDINAL_POSITION
                  AND c2.DATA_TYPE = c1.DATA_TYPE
                  -- the length is NULL for non-character types: NULL = NULL counts as a match
                  AND (   (c1.CHARACTER_MAXIMUM_LENGTH IS NULL AND c2.CHARACTER_MAXIMUM_LENGTH IS NULL)
                       OR c1.CHARACTER_MAXIMUM_LENGTH = c2.CHARACTER_MAXIMUM_LENGTH)
          )
    )
  AND NOT EXISTS (
        -- ... and the second table doesn't have any columns ...
        SELECT 1
        FROM INFORMATION_SCHEMA.COLUMNS AS c1
        WHERE c1.TABLE_SCHEMA = A.other_table_schema
          AND c1.TABLE_NAME = A.other_table_Name
          AND NOT EXISTS (
                -- ... that are not also found in the first table!
                SELECT 1
                FROM INFORMATION_SCHEMA.COLUMNS AS c2
                WHERE c2.TABLE_SCHEMA = A.table_schema
                  AND c2.TABLE_NAME = A.table_name
                  AND c2.ORDINAL_POSITION = c1.ORDINAL_POSITION
                  AND c2.DATA_TYPE = c1.DATA_TYPE
                  AND (   (c1.CHARACTER_MAXIMUM_LENGTH IS NULL AND c2.CHARACTER_MAXIMUM_LENGTH IS NULL)
                       OR c1.CHARACTER_MAXIMUM_LENGTH = c2.CHARACTER_MAXIMUM_LENGTH)
          )
    );
1

我对 INFORMATION_SCHEMA.COLUMNS 中的若干列计算校验和。这样就能找出某个表以及所有与它具有相同“幻数”(各列校验和之和)的表。

-- Finds tables that probably share the same structure by comparing a
-- per-table "magic number": the sum of CHECKSUM() over the definition of
-- every column of the table.
-- Because the per-column checksums are summed, column order does not
-- influence the result. Equal magic numbers mean the tables are candidates
-- for a match (checksums can collide), so confirm before merging.

if OBJECT_ID('tempdb..#alltables','U') is not null
    drop table #alltables;

create table #alltables (schema_name sysname,
        table_name sysname,
        magicnum decimal(18,0));

-- One set-based pass replaces the original row-by-row loop. That loop paged
-- through the tables by comparing the concatenated 'schema.table' string
-- while ordering by (schema, table); the two orders can disagree, which may
-- skip tables.
insert into #alltables (schema_name, table_name, magicnum)
select c.TABLE_SCHEMA,
       c.TABLE_NAME,
       -- each checksum is cast to decimal before summing to avoid int overflow
       SUM(CAST(
           CHECKSUM(c.COLUMN_NAME, c.COLUMN_DEFAULT, c.IS_NULLABLE,
                    c.DATA_TYPE, c.CHARACTER_MAXIMUM_LENGTH,
                    c.NUMERIC_PRECISION, c.NUMERIC_SCALE, c.DATETIME_PRECISION)
           as decimal(18,0)))
from INFORMATION_SCHEMA.COLUMNS as c
inner join INFORMATION_SCHEMA.TABLES as t
    on  t.TABLE_SCHEMA = c.TABLE_SCHEMA
    and t.TABLE_NAME = c.TABLE_NAME
-- only real tables are of interest; views are left out
where t.TABLE_TYPE = 'BASE TABLE'
group by c.TABLE_SCHEMA, c.TABLE_NAME;

-- Report every table whose magic number is shared with at least one other table.
select a.schema_name, a.table_name, a.magicnum
from #alltables as a
where a.magicnum in (select d.magicnum
                     from #alltables as d
                     group by d.magicnum
                     having COUNT(*) > 1)
order by a.magicnum, a.table_name, a.schema_name;