2011-01-18 160 views
2

我正在准备一些数据以便移交给第三方,不幸的是,对方一次只能处理2000条记录一批的数据。我有10万条记录,以后可能还要多次分割并导出这类数据,所以我想以某种方式把这个过程自动化。(标题:将SQL Server 2008查询结果分批导出)

在SQL Server 2008中有没有比较简单的方法来做到这一点?我要运行的并不是什么复杂的查询——它和 SELECT PKID FROM Sometable ORDER BY PKID 相差无几——虽然我可以用游标来完成,但我想知道有没有更好的办法。

回答

4
SET NOCOUNT ON;

-- Audit table mapping every PK value to the batch number it is exported in.
CREATE TABLE [dbo].[SyncAudit] (PkId INT, BatchNumber INT)

DECLARE @batchsize INT
    ,@rowcount INT
    ,@batchcount INT
    ,@rootdir VARCHAR(2048)
    ,@saveas VARCHAR(2048)
    ,@query VARCHAR(2048)
    ,@bcpquery VARCHAR(2048)
    ,@bcpconn VARCHAR(64)
    ,@bcpdelim VARCHAR(2)

SET  @rootdir = '\\SERVER1\SHARE1\FOLDER\'   -- UNC share where the export files are written
SET  @batchsize = 2000                       -- maximum records per exported file
SET  @bcpdelim = '|'                         -- field delimiter for the bcp output
SET  @bcpconn = '-T' -- Trusted
--SET  @bcpconn = '-U <username> -P <password>' -- SQL authentication

-- Multiply by 1.0 to force decimal division: with pure integer division the
-- quotient is truncated BEFORE CEILING runs, so @batchcount comes out one too
-- small and NTILE below packs MORE than @batchsize rows into each batch,
-- breaking the per-batch size limit.
SELECT @rowcount = COUNT(1),
    @batchcount = CEILING(COUNT(1) * 1.0 / @batchsize) FROM <@TableName, string, 'SomeTable'>

SELECT [BatchSize] = @BatchSize, [BatchCount] = @Batchcount

-- Assign every PK to a batch: NTILE distributes the rows evenly across
-- @batchcount groups in PK order.
INSERT INTO SyncAudit
SELECT
<@TableKey, string, 'PKField'>
,groupnum = NTILE(@batchcount) OVER (ORDER BY <@TableKey, string, 'PKField'>)
FROM
<@TableName, string, 'SomeTable'>

-- Export one file per batch, counting down from the highest batch number.
WHILE (@batchcount > 0)
BEGIN

SET @saveas = @rootdir + 'batchnumber-' + cast(@batchcount as varchar) + '.txt'
SET @query = ' SELECT [<@TableName, string, 'SomeTable'>].*
       FROM [' + db_name() + '].[dbo].[<@TableName, string, 'SomeTable'>]
       JOIN [' + db_name() + '].[dbo].[SyncAudit]
          ON [<@TableName, string, 'SomeTable'>].<@TableKey, string, 'PKField'> = [SyncAudit].PkId
          AND [SyncAudit].BatchNumber = ' + cast(@batchcount as varchar) + ''

-- Strip newlines from the query (bcp takes it on one command line); the ^
-- escapes the delimiter character so cmd.exe does not interpret it.
SET @bcpquery = 'bcp "' + replace(@query, char(10), '') + '" QUERYOUT "' + @saveas + '" -c -t^' + @bcpdelim + ' ' + @bcpconn + ' -S ' + @@servername
EXEC master..xp_cmdshell @bcpquery

SET @batchcount = @batchcount -1
END

DROP TABLE [dbo].[SyncAudit] -- or leave for reference
+0

非常有趣 - 它直接导出到文件的事实特别好。我还没有机会试用它,但今天下午我会尝试一下。 tyvm! – 2011-01-19 15:26:44

4

我认为你可以利用ROW_NUMBER,然后使用BETWEEN来指定你喜欢的一系列行。或者,如果您知道没有间隙,或者不在乎间隙,则可以使用PKID。

-- Page through the table with ROW_NUMBER(): number every row once in PKID
-- order inside the derived table, then pick one page with a range filter.
SELECT ... 
FROM 
    (SELECT ... 
     ROW_NUMBER() OVER(ORDER BY PKID) as RowNum 
    FROM Sometable e 
    ) t 
-- Closed range: @startRowIndex .. @startRowIndex + @maximumRows - 1.
WHERE RowNum BETWEEN @startRowIndex AND (@startRowIndex + @maximumRows) - 1 
-- NOTE(review): the numbering restarts from row 1 on every execution, so
-- fetching later pages gets progressively more expensive on large tables.

这通常用于分页结果。 4GuysFromRolla有很好的article on it

+0

是的,这是我打算带头的方向,因为没有更优雅的解决方案。但是,我没有太多与ROW_NUMBER()一起工作的经验,因此您的示例仍然非常有用。 :) – 2011-01-18 22:28:50

+1

row_number()会逐渐变得越来越慢,它的范围越来越远? Esp接近100k记录的末尾 – RichardTheKiwi 2011-01-19 00:22:16

2

你可以在一个 while @@ROWCOUNT 循环里逐段算出要处理的行范围。这可能比 ROW_NUMBER() 更好,因为后者每次都必须从头开始为所有行编号。

declare @startid int 
declare @endid int 
declare @nextendid int 
declare @rows int 

-- Seed the first key range. These probes are efficient: they walk the PKID
-- index by range rather than scanning the table.
select top(1) @startid = pkid from sometable order by pkid -- 1 key visited 
select top(2000) @endid = pkid from sometable order by pkid -- 2000 keys visited 
-- note: top 2000 may end up with the 514th id if that is the last one 
set @rows = @@ROWCOUNT 

while @rows > 0 
begin 
    -- NOTE(review): no column list on the INSERT/SELECT *; assumes
    -- backupcopy has exactly the same column layout as sometable — confirm.
    insert otherdb.dbo.backupcopy 
    select * from sometable 
    where pkid between @startid and @endid 

    select top(1) @startid = pkid from sometable 
    WHERE pkid > @endid -- binary locate 
    order by pkid 

    -- Assign into @nextendid, not @endid: referencing a variable in the WHERE
    -- clause while assigning it in the same SELECT has undefined results.
    select top(2000) @nextendid = pkid from sometable 
    WHERE pkid > @endid -- binary locate, then forward range lookup, max 2000 keys 
    order by pkid 

    -- Capture @@ROWCOUNT immediately; only advance @endid if a new range exists. 
    set @rows = @@ROWCOUNT 
    if @rows > 0 set @endid = @nextendid 
end 
0

我最终结合使用了 cyberkiwi 和 Adam 提供的方法。我不需要用 ROW_NUMBER,因为我在 table 数据类型(表变量)中使用了 IDENTITY 列。

下面是我实际使用的代码的整理版本——它运行得非常顺利。再次感谢大家的帮助!

use Testing 
GO 

SET NOCOUNT ON 

declare 
    @now datetime = GETDATE(), 
    @batchsize int = 2000,                            -- maximum records per exported file 
    @bcpTargetDir varchar(500) = '\\SomeServer\Upload\', 
    @csvQueryServer varchar(500) = '.\SQLExpress', 

    @rowcount integer, 
    @nowstring varchar(100), 
    @batch_id int, 
    @startid int, 
    @endid int, 
    @nextendid int, 
    @csvQuery varchar(max), 
    @bcpFilename varchar(200), 
    @bcpQuery varchar(1000) 

-- Log of every exported batch: the key range plus the query used to export it. 
declare @tblBatchRanges table ( 
    batch_id integer NOT NULL IDENTITY(1,1) PRIMARY KEY, 
    oid_start integer NOT NULL, 
    oid_end integer NOT NULL, 
    csvQuery varchar(max) 
) 

-- Create a unique timestamp-based string, which will be used to name the exported files. 
select @nowstring = CONVERT(varchar, @now, 112) + '-' + REPLACE(CONVERT(varchar, @now, 114), ':', '') 


-- Seed the first batch: @startid = lowest key, @endid = key of the 
-- @batchsize-th row (or the last row, if fewer remain). 
select top(1) @startid = oid from Testing..MyObjectIds order by oid 
select top(@batchsize) @endid = oid from Testing..MyObjectIds order by oid 
select @rowcount = @@ROWCOUNT 

while (@rowcount > 0) begin 
    -- Create a CSV of all object IDs in the batch, using the STUFF() function (http://goo.gl/EyE8L). 
    select @csvQuery = 'select stuff((select distinct '','' + CAST(oid as varchar) from Testing..MyObjectIds where oid between ' + CAST(@startid as varchar) + ' and ' + CAST(@endid as varchar) + ' order by '','' + CAST(oid as varchar) for xml path('''')),1,1,'''')' 


    -- Log the info and get the batch ID. 
    -- (Fix: the original supplied four values for three columns — a compile 
    -- error, Msg 110 — via an @oidCSV variable that was never populated.) 
    insert into @tblBatchRanges (oid_start, oid_end, csvQuery) 
     values (@startid, @endid, @csvQuery) 

    -- SCOPE_IDENTITY() instead of @@IDENTITY: not affected by identity 
    -- inserts performed in other scopes (e.g. triggers). 
    select @batch_id = SCOPE_IDENTITY() 


    -- Advance @startid and @endid so that they point to the next batch 
    select top(1) @startid = oid 
     from Testing..MyObjectIds 
     where oid > @endid 
     order by oid 

    -- Assign into @nextendid, not @endid: referencing a variable in the 
    -- WHERE clause while assigning it in the same SELECT is undefined. 
    select top(@batchsize) @nextendid = oid 
     from Testing..MyObjectIds 
     where oid > @endid 
     order by oid 

    select @rowcount = @@ROWCOUNT 
    if @rowcount > 0 set @endid = @nextendid 


    -- Export the current batch to a file (uses @csvQuery captured above, 
    -- before the range variables were advanced). 
    select @bcpFilename = 'MyExport-' + @nowstring + '-' + cast(@batch_id as varchar) + '.txt' 
    select @bcpQuery = 'bcp "' + @csvQuery + '" QUERYOUT "' + @bcpTargetDir + @bcpFilename + '" -S ' + @csvQueryServer + ' -T -c' 
    exec master..xp_cmdshell @bcpQuery 
end 

SET NOCOUNT OFF 


--Check all of the logged info. 
select oid_start, oid_end, csvQuery from @tblBatchRanges