Norman Khine wrote:

> Here is the latest version http://pastie.org/1066582 can this be
> further improved?

> # get all the duplicates and clean the products table
> main.execute("SELECT product_Id, url FROM %s.product WHERE url != ''" % 
db)
> results = main.fetchall()
> 
> d = defaultdict(set)
> for id, url in results:
>       d[url].add(id)
> 
> for ids in d.itervalues():
>       if len(ids) > 1:
>               # we now hove the first product_id added
>               canonical = min(ids)
>               ids = list(ids)
>               ids.pop(ids.index(canonical))
>               for id in ids:
>                       update_product_id = 'UPDATE 
> oneproduct.productList_product_assoc SET 
productList_id=%s WHERE productList_id=%s'
>                       main.execute(update_product_id, (id, canonical))
>                       org.commit()
> main.close()

Yes; do it in SQL. Here's my attempt:

# Disclaimer: I stopped at the first point where it seemed to work; don't 
# apply the following on valuable data without extensive prior tests.
 
import sqlite3 
WIDTH = 80
db = sqlite3.connect(":memory:")

url_table = [
(24715,"http://aqoon.local/muesli/2-muesli-tropical-500g.html";),
(24719,"http://aqoon.local/muesli/2-muesli-tropical-500g.html";),
(24720,"http://example.com/index.html";)
]
cursor = db.cursor()
cursor.execute("create table alpha (id, url)")
cursor.executemany("insert into alpha values (?, ?)", url_table)

c2 = db.cursor()

id_table = [
(1, 24715),
(2, 24719),
(3, 24720)
]

cursor.execute("create table beta (id, alpha_id)")
cursor.executemany("insert into beta values (?, ?)", id_table)

def show(name):
    print name.center(WIDTH, "-")
    for row in cursor.execute("select * from %s" % name):
        print row
    print

print " BEFORE ".center(WIDTH, "=")
show("alpha")
show("beta")

cursor.execute("""
create view gamma as 
select min(a.id) new_id, b.id old_id from alpha a, alpha b 
where a.url = b.url group by a.url
""")
cursor.execute("""
update beta 
set alpha_id = (select new_id from gamma where alpha_id = old_id) 
where (select new_id from gamma where alpha_id = old_id) is not Null
""")

cursor.execute("""
delete from alpha 
where id not in (select min(b.id) 
from alpha b where alpha.url = b.url)
""")

print " AFTER ".center(WIDTH, "=")
show("alpha")
show("beta")

A database expert could probably simplify that a bit.

Again: duplicate records are best not created rather than removed. If you 
can create a unique index for the url column and alter your insertion code 
appropriately.

Peter

_______________________________________________
Tutor maillist  -  Tutor@python.org
To unsubscribe or change subscription options:
http://mail.python.org/mailman/listinfo/tutor

Reply via email to