I'll have a go at it. Try this:
Select
Top 1000 *
From (
SELECT *
FROM addresses ab1
INNER JOIN (
SELECT LastName,ZipCode
FROM Addresses
WHERE LastName > Space(100)
AND Zipcode <> ''
GROUP BY LastName,ZipCode
HAVING COUNT(*)>1
) ab2
ON ab1.LastName=ab2.LastName
AND ab1.ZipCode =ab2.ZipCode
UNION ALL
SELECT *
FROM addresses ab1
INNER JOIN (
SELECT LastName
FROM Addresses
WHERE LastName > Space(100)
HAVING COUNT(*)>1
GROUP BY LastName
AND MIN(ZipCode)=''
) ab2
ON ab1.LastName=ab2.LastName
AND ab1.ZipCode =''
UNION ALL
SELECT *
FROM addresses ab1
INNER JOIN (
SELECT Company,ZipCode
FROM Addresses
WHERE Company > Space(100)
AND Zipcode <> ''
GROUP BY Company,ZipCode
HAVING COUNT(*)>1
AND MIN(LastName) < MAX(LastName)
) ab2
ON ab1.Company=ab2.Company
AND ab1.ZipCode=ab2.ZipCode
UNION ALL
SELECT *
FROM addresses ab1
INNER JOIN (
SELECT Company
FROM Addresses
WHERE Company > Space(100)
GROUP BY Company
HAVING COUNT(*)>1
AND MIN(LastName) < MAX(LastName)
AND MIN(ZipCode)=''
) ab2
ON ab1.Company=ab2.Company
AND ab1.ZipCode=''
) X
Order By
LastName, FirstName
Note that the predicate "AND MIN(LastName) < MAX(LastName)" tries to
eliminate duplicate duplicates. However, this may result in a missed
Company duplicate, because of existing LastName duplicates for the same
ZipCode.
Of course, if you are using TOP 1000 to just get the first 1000
duplicates (and not all duplicates), then you can also do something like
this:
Declare @count int
Set @count=0
SELECT TOP 1000 *
FROM addresses ab1
INNER JOIN (
SELECT LastName,ZipCode
FROM Addresses
WHERE LastName > Space(100)
AND Zipcode <> ''
GROUP BY LastName,ZipCode
HAVING COUNT(*)>1
) ab2
ON ab1.LastName=ab2.LastName
AND ab1.ZipCode =ab2.ZipCode
ORDER BY LastName, FirstName
Set @Count=@Count+@@rowcount
If @Count < 1000
Begin
SET ROWCOUNT 1000-@Count
SELECT TOP 1000 *
FROM addresses ab1
INNER JOIN (
SELECT LastName
FROM Addresses
WHERE LastName > Space(100)
HAVING COUNT(*)>1
GROUP BY LastName
AND MIN(ZipCode)=''
) ab2
ON ab1.LastName=ab2.LastName
AND ab1.ZipCode =''
ORDER BY LastName, FirstName
Set @Count=@Count+@@rowcount
End
If @Count < 1000
Begin
SET ROWCOUNT 1000-@Count
SELECT TOP 1000 *
FROM addresses ab1
INNER JOIN (
SELECT Company,ZipCode
FROM Addresses
WHERE Company > Space(100)
AND Zipcode <> ''
GROUP BY Company,ZipCode
HAVING COUNT(*)>1
AND MIN(LastName) < MAX(LastName)
) ab2
ON ab1.Company=ab2.Company
AND ab1.ZipCode=ab2.ZipCode
ORDER BY LastName, FirstName
Set @Count=@Count+@@rowcount
End
If @Count < 1000
Begin
SET ROWCOUNT 1000-@Count
SELECT TOP 1000 *
FROM addresses ab1
INNER JOIN (
SELECT Company
FROM Addresses
WHERE Company > Space(100)
GROUP BY Company
HAVING COUNT(*)>1
AND MIN(LastName) < MAX(LastName)
AND MIN(ZipCode)=''
) ab2
ON ab1.Company=ab2.Company
AND ab1.ZipCode=''
ORDER BY LastName, FirstName
End
SET ROWCOUNT 0
Other notes:
- The predicate "ab2.Ad_Id != ab1.Ad_Id" uses proprietary syntax. The
ANSI-SQL syntax is "ab2.Ad_Id <> ab1.Ad_Id"
- If you are comparing with an empty string, then it is useless to
perform two Trim functions. So you can simplify
"Ltrim(RTrim(ab1.Company)) != ''" to "RTrim(ab1.Company) <> ''". In the
query above, it is translated to "ab1.Company > Space(100)", because
this makes it a usuable search argument for the optimizer
Hope this helps,
Gert-Jan
Sebastian wrote:
The following query needs about 2 minutes to complete (finding dupes)
on a table of about 10000 addresses. Does anyone have an idea on how
to speed this up ?
Thanks in advance !!!
Sebastian
Select
Top 1000 *
From
addresses ab1
Where
(
Select Count(*) From addresses base ab2 Where
(
(
(ab2.LastName = ab1.LastName And Ltrim(RTrim(ab1.LastName)) != '' )
Or
(ab2.Company = ab1.Company And (Ltrim(RTrim(ab1.Company)) != '') )
)
And
(
ab2.ZipCode = ab1.ZipCode
Or
ab1.ZipCode = ''
)
)
And ab2.Ad_Id != ab1.Ad_Id
) >= 1
Order By
LastName, FirstName