Use tempdb
/* Interview
Question.
How
to identify and remove duplicates.
Imagine
this is a very large (billion+ rows) table with
foreign
keys. Therefore it is not acceptable to
delete
and
then re-add a set of distinct rows.
Assignment: Remove duplicate rows. For example, if there are
4
rows of Daffy Duck, need to remove 3 rows.
*/
-- First let's create a sample Employee table
containing dups.
-- This table is intentionally made
without a primary key
-- for a greater challenge.
-- drop table Employee
CREATE TABLE dbo.Employee (
FirstName varchar(20),
LastName varchar(20)
)
INSERT INTO dbo.Employee VALUES ('Mickey', 'Mouse')
INSERT INTO dbo.Employee VALUES ('Donald', 'Duck')
INSERT INTO dbo.Employee VALUES ('Daffy', 'Duck')
INSERT INTO dbo.Employee VALUES ('Daffy', 'Duck')
INSERT INTO dbo.Employee VALUES ('Minnie', 'Mouse')
INSERT INTO dbo.Employee VALUES ('Daffy', 'Duck')
INSERT INTO dbo.Employee VALUES ('Beaver', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('Daffy', 'Duck')
INSERT INTO dbo.Employee VALUES ('Huey', 'Duck')
INSERT INTO dbo.Employee VALUES ('Beaver', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('Beaver', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('Ward', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('Ward', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('Ward', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('Ward', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('June', 'Cleaver')
INSERT INTO dbo.Employee VALUES ('June', 'Cleaver')
-- Examine table with dups. 17 rows.
SELECT *
FROM Employee
ORDER BY 2, 1
-- After we have removed dups, should
look like this, 8 rows.
SELECT DISTINCT *
FROM Employee
ORDER BY 2, 1
-- Identify dups and put into temp
table
-- DROP TABLE #Dups
SELECT identity (int, 1, 1) ID,
FirstName, LastName, Count(*) Qty
INTO #Dups
FROM Employee
GROUP BY FirstName, LastName
HAVING Count(*) > 1
ORDER BY 2, 1
-- If the #Dups table contains more
than a thousand rows, might be worthwile to make this index
CREATE UNIQUE INDEX idx_Dups ON #Dups(ID) INCLUDE (FirstName, LastName)
-- Examine the dups
SELECT *
FROM #Dups
-- Remove the dups
DECLARE @FirstName varchar(20),
@LastName varchar(20),
@ID int,
@MaxID int,
@NumToDelete int
SELECT @MaxID = Max(ID),
@ID = 0
FROM #Dups
WHILE @ID < @MaxID
BEGIN
-- Get next ID
SELECT @ID = Min(ID)
FROM #Dups
WHERE ID > @ID
-- Get other attributes for this ID
SELECT @FirstName = FirstName,
@LastName = LastName,
@NumToDelete = Qty - 1
FROM #Dups
WHERE ID = @ID
DELETE TOP (@NumToDelete)
FROM Employee
WHERE FirstName = @FirstName
AND LastName = @LastName
END
-- Examine to verify dups have been
removed
SELECT *
FROM Employee
No comments:
Post a Comment