My version, not thoroughly tested. It probably doesn't work well for tables
with few rows. It finds the most frequent word-start positions, and then
splits each line of the data at those positions.
# Sample table. The column alignment is significant: the detection below
# relies on cells of the same column starting at the same character offset
# (here 0, 11, 25, 35 and 49).
data = """\
44544      ipod          apple     black         102
GFGFHHF-12 unknown thing bizar     brick mortar  tbc
45fjk      do not know   + is less               biac
           disk          seagate   250GB         130
5G_gff                   tbd       tbd
gjgh88hgg  media record  a and b                 12
hjj        foo           bar       hop           zip
hg uy oi   hj uuu ii a   qqq ccc v ZZZ Ughj
qdsd       zert                    nope          nope
"""
import pprint
import re
# import collections # For Python 2.5

# RE to find the beginning of words: a non-space character sitting on a
# word boundary.
tpatt = re.compile(r"\b[^ ]")

# Remove empty lines. Materialised as a list because `lines` is iterated
# more than once (filter() returns a one-shot iterator on Python 3).
lines = list(filter(None, data.splitlines()))

# Find the positions of all word beginnings
# This finds: treshs = [0, 11, 25, 35, 49, ...
# 44544      ipod          apple     black         102
# ^          ^             ^         ^             ^
treshs = [ob.start() for li in lines for ob in tpatt.finditer(li)]

# Count how often each position occurs (position -> number of occurrences)
freqs = {}
for el in treshs:
    freqs[el] = freqs.get(el, 0) + 1

# Same count, alternative for Python 2.5
# freqs = collections.defaultdict(int)
# for el in treshs:
#     freqs[el] += 1

# Cut-off frequency: 60% of the most frequent position's count
bigf = max(freqs.values()) * 0.6

# The most common word beginnings are taken as the column beginnings;
# only positions strictly more frequent than the cut-off are kept.
cols = sorted(k for k, v in freqs.items() if v > bigf)
def xpairs(alist):
    """Yield every pair of adjacent items of *alist* as a two-item slice.

    xpairs([0, 1, 2, 3]) ==> [0, 1], [1, 2], [2, 3]

    *alist* must support len() and slicing. Each yielded pair is a slice
    of *alist*, so it has the same sequence type as the input. Nothing is
    yielded when *alist* has fewer than two items.
    """
    # range() instead of xrange() so the function runs on Python 2 and 3.
    for i in range(len(alist) - 1):
        yield alist[i:i + 2]
# Cut every line at the column beginnings and strip the padding of each
# cell. The None appended to cols makes the last slice run to the end of
# the line.
result = [[li[x:y].strip() for x, y in xpairs(cols + [None])]
          for li in lines]

# print(...) with a single argument prints the same text on Python 2 and 3.
print(data)
pprint.pprint(result)
"""
Output:
44544 ipod apple black 102
GFGFHHF-12 unknown thing bizar brick mortar tbc
45fjk do not know + is less biac
disk seagate 250GB 130
5G_gff tbd tbd
gjgh88hgg media record a and b 12
hjj foo bar hop zip
hg uy oi hj uuu ii a qqq ccc v ZZZ Ughj
qdsd zert nope nope
[['44544', 'ipod', 'apple', 'black', '102'],
['GFGFHHF-12', 'unknown thing', 'bizar', 'brick mortar', 'tbc'],
['45fjk', 'do not know', '+ is less', '', 'biac'],
['', 'disk', 'seagate', '250GB', '130'],
['5G_gff', '', 'tbd', 'tbd', ''],
['gjgh88hgg', 'media record', 'a and b', '', '12'],
['hjj', 'foo', 'bar', 'hop', 'zip'],
['hg uy oi', 'hj uuu ii a', 'qqq ccc v', 'ZZZ Ughj', ''],
['qdsd', 'zert', '', 'nope', 'nope']]
"""
Bye,
bearophile
--
http://mail.python.org/mailman/listinfo/python-list