2013-05-07 73 views
1

我有两个数据表,其中的数据类似如下(T-SQL:将链接数据拆分为匹配表和不匹配表):

| id | name | dob |   | name | dob | 
|-------|------|----------|   |------|----------| 
| 12345 | ABC | 20010301 |   | ABC | 20010301 | - matching record 
| 45678 | DEF | 20010425 |   | XYZ | 20010301 | - unmatched record 

是否可以编写一个查询来比较这两个表,然后创建一个匹配表和一个不匹配表,同时保持原始(original)表的结构和数据不变?

Match Table  Unmatched Table 
| id | rank |  | id | rank | 
|-------|------|  |-------|------| 
| 12345 | 1 |  | 45678 | NULL | 

我尝试过使用 MERGE,但它要求我对其中一个源表执行插入/更新,而我的 T-SQL 水平已经到了极限——另外,我要处理的数据集超过 3000 万行——有什么意见或建议吗?
我目前写出的 SQL 如下所示(字段与上面的示例不一致,但原理相同):

-- Sample cohort: the IDs to be classified as matched / unmatched.
-- [match rank] starts out NULL for every row.
Create TABLE #Cohort ([ID] varchar(4),[match rank] int) 
INSERT INTO #Cohort ([ID],[match rank]) VALUES('aaaa',NULL) 
INSERT INTO #Cohort ([ID],[match rank]) VALUES('bbbb',NULL) 
INSERT INTO #Cohort ([ID],[match rank]) VALUES('cccc',NULL) 
INSERT INTO #Cohort ([ID],[match rank]) VALUES('dddd',NULL) 

-- Sample link table: three IDs built from the first four characters of a
-- NEWID() value, plus one ID ('aaaa') that also exists in #Cohort.
Create TABLE #link ([ID] varchar(4),[match rank] int) 
INSERT INTO #link ([ID],[match rank]) VALUES(left(NEWID(),4),NULL) 
INSERT INTO #link ([ID],[match rank]) VALUES(left(NEWID(),4),NULL) 
INSERT INTO #link ([ID],[match rank]) VALUES('aaaa',NULL) 
INSERT INTO #link ([ID],[match rank]) VALUES(left(NEWID(),4),NULL) 

-- Intended destinations for the two result sets; both start empty.
Create TABLE #Matches ([ID] varchar(4),[match rank] int) 
Create TABLE #Unmatched ([ID] varchar(4),[match rank] int) 

-- Attempted split using MERGE with #Cohort itself as the target.
-- The source (sc) is the distinct set of #Cohort IDs that also appear in
-- #link, each tagged with [match rank] = 1.
MERGE #Cohort tg 
USING (SELECT distinct c.[ID], 1 as [match rank] 
     from #Cohort c 
     INNER JOIN #link as h on c.[ID]=h.[ID]) sc 
ON (tg.[ID] = sc.[ID]) 
-- Inserts source rows that have no target row. sc is derived from #Cohort,
-- so NOTE(review): this branch is not expected to fire for this data.
WHEN NOT MATCHED BY TARGET 
    THEN INSERT([ID],[match rank]) VALUES(sc.[ID],sc.[match rank]) 
-- Deletes every #Cohort row whose ID is absent from sc, i.e. the original
-- table is modified rather than left intact.
WHEN NOT MATCHED BY SOURCE 
    THEN DELETE 
-- Copies the deleted rows into #Unmatched. There is no WHEN MATCHED clause,
-- so matching rows are not updated and nothing here writes to #Matches.
OUTPUT Deleted.* INTO #Unmatched; 

回答

0

使用 CTE,最终 #Matches 中将包含匹配的行,#Unmatched 中将包含不匹配的行。就目前而言,您的 MERGE 语句会删除 #Cohort 表中的行,只剩下 aaaa 这一个值。

-- Demo: split #Cohort into matched / unmatched IDs without modifying
-- #Cohort or #link.
--   #Matches   : cohort IDs that also occur in #link, [MATCH RANK] = 1
--   #Unmatched : cohort IDs with no row in #link,     [MATCH RANK] = NULL
-- Temp-table names use one consistent casing so the script also runs on a
-- server with a case-sensitive collation.

CREATE TABLE #Cohort ([ID] VARCHAR(4), [MATCH RANK] INT);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('aaaa', NULL);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('bbbb', NULL);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('cccc', NULL);
INSERT INTO #Cohort ([ID], [MATCH RANK]) VALUES ('dddd', NULL);

-- Three random IDs plus one ('aaaa') that is also present in #Cohort.
CREATE TABLE #link ([ID] VARCHAR(4), [MATCH RANK] INT);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES (LEFT(NEWID(), 4), NULL);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES (LEFT(NEWID(), 4), NULL);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES ('aaaa', NULL);
INSERT INTO #link ([ID], [MATCH RANK]) VALUES (LEFT(NEWID(), 4), NULL);

CREATE TABLE #Matches ([ID] VARCHAR(4), [MATCH RANK] INT);
CREATE TABLE #Unmatched ([ID] VARCHAR(4), [MATCH RANK] INT);

-- Matched cohort IDs. DISTINCT collapses the duplicates produced when an ID
-- occurs more than once in #link. Rank 1 marks the row as a match.
WITH MatchedTbl AS
(
    SELECT DISTINCT c.[ID], 1 AS [MATCH RANK]
    FROM #Cohort AS c
    INNER JOIN #link AS h ON c.[ID] = h.[ID]
)
INSERT INTO #Matches ([ID], [MATCH RANK])
SELECT m.[ID], m.[MATCH RANK]
FROM MatchedTbl AS m;

-- Unmatched cohort IDs. NOT EXISTS is used instead of NOT IN because NOT IN
-- returns no rows at all as soon as the subquery yields a NULL ID.
WITH NonMatchedTbl AS
(
    SELECT DISTINCT c.[ID], CAST(NULL AS INT) AS [MATCH RANK]
    FROM #Cohort AS c
    WHERE NOT EXISTS (SELECT 1 FROM #link AS h WHERE h.[ID] = c.[ID])
)
INSERT INTO #Unmatched ([ID], [MATCH RANK])
SELECT n.[ID], n.[MATCH RANK]
FROM NonMatchedTbl AS n;

-- Source tables are unchanged; the two result tables hold the split.
SELECT [ID], [MATCH RANK] FROM #Cohort;
SELECT [ID], [MATCH RANK] FROM #link;
SELECT [ID], [MATCH RANK] FROM #Matches;
SELECT [ID], [MATCH RANK] FROM #Unmatched;

DROP TABLE #Cohort;
DROP TABLE #link;
DROP TABLE #Matches;
DROP TABLE #Unmatched;
0

查找匹配/不匹配记录的标准方法是执行左连接(LEFT JOIN),并在被连接的表中查找 NULL 值。

-- Matched rows: one row per Table1 id that has at least one partner row in
-- Table2; [rank] is the number of Table2 rows that matched that id.
-- An INNER JOIN states the intent directly (a LEFT JOIN filtered with
-- "IS NOT NULL" returns the same rows).
-- No ORDER BY: rows stored by SELECT ... INTO have no guaranteed order, so
-- sort when reading #MatchedTable instead.
-- NOTE(review): the sample rows pair up on name + dob; drop the dob
-- predicate if name alone identifies a record.
SELECT t1.id, COUNT(*) AS [rank]
INTO #MatchedTable
FROM Table1 AS t1
INNER JOIN Table2 AS t2
    ON t2.name = t1.name
   AND t2.dob = t1.dob
GROUP BY t1.id;

和:

-- Unmatched rows: LEFT JOIN anti-join. A Table1 row with no partner in
-- Table2 comes back with NULLs in every t2 column, so "t2.name IS NULL"
-- keeps exactly those rows. GROUP BY collapses repeated ids.
-- [rank] is an explicitly typed NULL so the column is created as INT.
-- No ORDER BY: rows stored by SELECT ... INTO have no guaranteed order.
-- NOTE(review): the sample rows pair up on name + dob; drop the dob
-- predicate if name alone identifies a record.
SELECT t1.id, CAST(NULL AS INT) AS [rank]
INTO #UnmatchedTable
FROM Table1 AS t1
LEFT JOIN Table2 AS t2
    ON t2.name = t1.name
   AND t2.dob = t1.dob
WHERE t2.name IS NULL
GROUP BY t1.id;

希望这对您有所帮助。

0

如果您要处理海量数据,可以尝试两种做法。如果仍想使用 MERGE 语句,可以尝试分批(batch)执行,而不是一次处理全部数据;或者,您也可以先划分批次,然后直接插入。无论采用哪种方式,我都建议先使用一个暂存区(staging area),在其上创建索引,然后再插入。可以使用 NTILE 函数来分配批次。下面这个自包含的示例可在 SQL Server 2008 或更高版本上运行:

-- Self-contained batching demo (SQL Server 2008+): stage the joined data
-- once, tag every staged row with a batch number, then load one batch at a
-- time into the matched / unmatched targets.

declare @Person table (personID int identity, person varchar(8));

insert into @Person (person)
values ('Brett'),('Sean'),('Chad'),('Michael'),('Ray'),('Erik'),('Quyen'),('John'),('Tim');

declare @Orders table (OrderID int identity, PersonID int, Description varchar(32), Amount int);

insert into @Orders (PersonID, Description, Amount)
values (1, 'Shirt', 20),(1, 'Shoes', 50),(2, 'Shirt', 22),(2, 'Shoes', 52),(3, 'Shirt', 20),(3, 'Shoes', 50),(3, 'Hat', 20),(4, 'Shirt', 20),(5, 'Shirt', 20),(5, 'Pants', 30),
(6, 'Shirt', 20),(6, 'RunningShoes', 70),(7, 'Shirt', 22),(7, 'Shoes', 40),(7, 'Coat', 80);

-- Staging area: every person/order row plus the batch it belongs to.
declare @Storage table (batch int, personID int, person varchar(8), OrderID int, Description varchar(32), Amount int);

insert into @Storage (batch, personID, person, OrderID, Description, Amount)
select
    -- ntile(5) splits the whole ordered row set into 5 groups of (nearly)
    -- equal size, e.g. 500 rows would give 5 batches of 100 rows each.
    -- OrderID is a tie-breaker so batch membership is repeatable between runs.
    ntile(5) over (order by p.personID, o.OrderID)
,   p.personID
,   p.person
,   o.OrderID
,   o.Description
,   o.Amount
from @Person p
    -- left join keeps persons that have no orders (their order columns are NULL)
    left join @Orders o on p.personID = o.PersonID;

-- Batch number to load on this run; change it (1..5) to process another batch.
declare @Cursor int = 5;

-- pretend target tables for the matched / unmatched rows
declare @Matched table (personID int, person varchar(8), OrderID int, Description varchar(32), Amount int);
declare @UnMatched table (personID int, person varchar(8), OrderID int, Description varchar(32), Amount int);

-- Matched: staged rows of the current batch that found an order.
insert into @Matched (personID, person, OrderID, Description, Amount)
select
    personID
,   person
,   OrderID
,   Description
,   Amount
from @Storage
where batch = @Cursor
and OrderID is not null;

-- Unmatched: staged rows of the current batch whose person has no order.
insert into @UnMatched (personID, person, OrderID, Description, Amount)
select
    personID
,   person
,   OrderID
,   Description
,   Amount
from @Storage
where batch = @Cursor
and OrderID is null;

select personID, person, OrderID, Description, Amount from @Matched;
select personID, person, OrderID, Description, Amount from @UnMatched;

我的例子很简单,但您可以修改 @Cursor 变量的值,看到从暂存表中取出不同的结果。由于采用了批处理,我不必一次处理整个数据集;我可以先把数据放入暂存表(@Storage),然后编写一个存储过程,根据不断变化的游标值(一个整数)执行插入操作。您甚至可以添加一个 bit 类型的列,用来标记数据是否已被处理。