About Me

Colorado
Paul has 18 years of experience with Microsoft SQL Server. He has worked in the roles of production DBA, database developer, database architect, applications developer, business intelligence and data warehouse developer, and instructor for students aspiring to MCDBA certification. He has performed numerous data migrations and supported large databases (3 terabytes, 1+ billion rows) with high transaction volumes. He is a member of PASS, blogs about lessons learned from a developer’s approach to SQL Server administration, and was the president of the Boulder SQL Server Users’ Group for 11 years, from January 2009 to 2020.

Friday, April 10, 2015

Interview Question - How to remove duplicates in a large table with no primary key

USE tempdb;

/*
      Interview question: how do you identify and remove duplicate rows?

      Scenario: picture a very large table (a billion rows or more)
            that has foreign keys.  Deleting the rows and then
            re-adding a set of distinct rows is therefore not acceptable.

      Task: remove the duplicate rows in place.  For example, when
            Daffy Duck appears in 4 rows, 3 of those rows must go.
*/

-- First let's create a sample Employee table containing dups.
-- This table is intentionally made without a primary key
-- (and with nullable columns) for a greater challenge.
-- 17 rows are loaded, covering 8 distinct (FirstName, LastName) pairs.

-- drop table Employee
CREATE TABLE dbo.Employee (
      FirstName varchar(20),
      LastName varchar(20)
      );

-- Each INSERT names its target columns so the data keeps loading correctly
-- even if the column order changes or a column is added later.
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Mickey', 'Mouse');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Donald', 'Duck');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Daffy', 'Duck');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Daffy', 'Duck');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Minnie', 'Mouse');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Daffy', 'Duck');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Beaver', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Daffy', 'Duck');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Huey', 'Duck');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Beaver', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Beaver', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Ward', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Ward', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Ward', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('Ward', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('June', 'Cleaver');
INSERT INTO dbo.Employee (FirstName, LastName) VALUES ('June', 'Cleaver');


-- Examine table with dups.  17 rows.
-- Columns are named explicitly and the sort is by name rather than by
-- position, so the result does not depend on the table's column order.
SELECT FirstName, LastName
FROM dbo.Employee
ORDER BY LastName, FirstName;

-- After we have removed dups, should look like this, 8 rows.
SELECT DISTINCT FirstName, LastName
FROM dbo.Employee
ORDER BY LastName, FirstName;

-- Identify dups and put into temp table
-- One row per (FirstName, LastName) pair that occurs more than once:
--   ID  - sequential number generated by IDENTITY(int, 1, 1)
--   Qty - how many Employee rows carry that name
-- DROP TABLE #Dups
SELECT  IDENTITY(int, 1, 1) AS ID,
        FirstName,
        LastName,
        COUNT(*) AS Qty
INTO #Dups
FROM dbo.Employee
GROUP BY FirstName, LastName
HAVING COUNT(*) > 1
-- Sort by column name: with ID added as the first select-list column,
-- positional references no longer mean (LastName, FirstName).
ORDER BY LastName, FirstName;


-- If the #Dups table contains more than a thousand rows, it might be worthwhile to make this index.
-- Qty is included alongside the name columns because the per-ID lookup in the
-- delete loop reads FirstName, LastName and Qty; with all three in the index
-- that lookup can be answered from the index alone.
CREATE UNIQUE INDEX idx_Dups ON #Dups (ID) INCLUDE (FirstName, LastName, Qty);


-- Review what was captured: one row per duplicated name, with its row count
SELECT  d.ID,
        d.FirstName,
        d.LastName,
        d.Qty
FROM #Dups AS d;



-- Remove the dups
-- Walks #Dups one ID at a time.  For each duplicated name it deletes
-- Qty - 1 of the matching Employee rows, leaving exactly one behind.
-- NOTE(review): Qty is the count captured when #Dups was built; this assumes
-- nothing else modifies Employee between that point and the deletes -- confirm.
DECLARE @FirstName varchar(20),
        @LastName varchar(20),
        @ID int,
        @MaxID int,
        @NumToDelete int;

-- MAX over an empty #Dups yields NULL, so with no dups the WHILE
-- condition is never true and nothing is deleted.
SELECT  @MaxID = MAX(ID),
        @ID = 0
FROM #Dups;


WHILE @ID < @MaxID
      BEGIN
            -- Get next ID
            SELECT @ID = MIN(ID)
            FROM #Dups
            WHERE ID > @ID;

            -- Get other attributes for this ID
            SELECT  @FirstName = FirstName,
                    @LastName = LastName,
                    @NumToDelete = Qty - 1   -- keep one row of each name
            FROM #Dups
            WHERE ID = @ID;


            -- Remove rows.
            -- The columns are nullable and GROUP BY puts NULL names into one
            -- group, but "=" never matches NULL, so NULLs are compared
            -- explicitly; otherwise those dups would be found but not removed.
            DELETE TOP (@NumToDelete)
            FROM dbo.Employee
            WHERE (FirstName = @FirstName
                   OR (FirstName IS NULL AND @FirstName IS NULL))
              AND (LastName = @LastName
                   OR (LastName IS NULL AND @LastName IS NULL));


      END


-- Examine to verify dups have been removed.
-- Sorted by LastName, FirstName so the output can be compared line by line
-- with the 8-row SELECT DISTINCT result produced before the delete.
SELECT FirstName, LastName
FROM dbo.Employee
ORDER BY LastName, FirstName;




No comments:

Post a Comment