Clustering TabEL
[1]:
%%capture --no-display
import logging as log
log.getLogger().setLevel(log.DEBUG)
import takco
tables = takco.DaskHashBag.load(
f'hdfs://bricks07:9000/user/kruit/tabel/1-aa',
address = 'tcp://192.168.62.207:8786'
)
# tables = takco.HashBag.load('../../data/TabEL/75k-part/75k-part-aa')
tables.client if hasattr(tables, 'client') else None
[1]:
Client
|
Cluster
|
[2]:
%%time
assets = takco.config.parse('../../resources/config-wikidata.toml')
pipeline = takco.config.parse('../../resources/pipelines/TabEL.toml')
steps = takco.config.build('step', {**assets, **pipeline})
workdir = 'output/tabel-notebook'
tables = takco.TableSet.run(steps[:4], input_tables=tables, workdir=workdir).persist()
if hasattr(tables, 'bag'):
print(tables.bag.count().compute())
INFO:root:Running pipeline in output/tabel-notebook using <takco.util.HashBag object at 0x7f054b3375d0>
INFO:root:Chaining pipeline step 0-reshape
INFO:root:Restructuring with rules: [{'find': 'Precededby ', 'header': 'Preceded by'}, {'find': 'Succeededby ', 'header': 'Succeeded by'}]
INFO:root:Unpivoting with heuristics: NumSuffix, SeqPrefix, SpannedRepeat, AgentLikeHyperlink, AttributePrefixFinder
INFO:root:Chaining pipeline step 1-cluster
INFO:root:Chaining pipeline step 2-link
DEBUG:root:Lookup with <takco.link.sqlite.SQLiteLookup object at 0x7f04ac0a7290>
INFO:root:Chaining pipeline step 3-coltypes
371
CPU times: user 64.8 ms, sys: 11.9 ms, total: 76.7 ms
Wall time: 6.13 s
[ ]:
[3]:
%%time
step_config = steps[4]
step_config.pop('step')
clusters = takco.TableSet.cluster(
tables,
workdir=workdir,
**step_config,
).tables.persist()
INFO:root:Numbering tables...
INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
INFO:root:Dask offset tableIndex tableIndex 1
INFO:root:Dask offset numCols columnIndexOffset 0
INFO:root:Building matchers: headjacc, headvec, bodylsh, bodyvec, bodytype
INFO:root:Indexing headjacc
DEBUG:root:Serializing <takco.cluster.matchers.celljacc.CellJaccMatcher object at 0x7f838c3a35d0> to output/tabel-notebook/headjacc
INFO:root:Indexing headvec
DEBUG:root:faiss info: analyzing 1555 vectors of size 50
no NaN or Infs in data
761 vectors are distinct (48.94%)
vector 28 has 59 copies
range of L2 norms=[1, 1] (0 null vectors)
vectors are normalized, inner product and L2 search are equivalent
matrix contains no 0s
no constant dimensions
no dimension has a too large mean
stddevs per dimension are in [0.0822988 0.199019]
DEBUG:root:Serializing <takco.cluster.matchers.embedding.EmbeddingMatcher object at 0x7f838c1f24d0> to output/tabel-notebook/headvec
INFO:root:Indexing bodylsh
Indexing bodylsh: 100%|██████████| 1910/1910 [00:00<00:00, 5115.95it/s]
DEBUG:root:Serializing <takco.cluster.matchers.lsh.LSHMatcher object at 0x7f8302e14e50> to output/tabel-notebook/bodylsh
INFO:root:Indexing bodyvec
DEBUG:root:faiss info: analyzing 1751 vectors of size 50
no NaN or Infs in data
1677 vectors are distinct (95.77%)
vector 895 has 7 copies
range of L2 norms=[1, 1] (0 null vectors)
vectors are normalized, inner product and L2 search are equivalent
matrix contains no 0s
no constant dimensions
no dimension has a too large mean
stddevs per dimension are in [0.0778333 0.257962]
DEBUG:root:Serializing <takco.cluster.matchers.embedding.EmbeddingMatcher object at 0x7f8302d0ea90> to output/tabel-notebook/bodyvec
INFO:root:Indexing bodytype
DEBUG:root:TypeCos index is len 383
DEBUG:root:Serializing <takco.cluster.matchers.typecos.TypeCosMatcher object at 0x7f82ef9e01d0> to output/tabel-notebook/bodytype
INFO:root:Indexing section
DEBUG:root:Serializing <takco.cluster.matchers.celljacc.CellJaccMatcher object at 0x7f8317558ed0> to output/tabel-notebook/section
INFO:root:Blocking tables; computing and aggregating column sims...
INFO:root:Got 391 table similarities; 375x reduction
INFO:root:Created graph IGRAPH U-W- 383 391 -- + attr: weight (e)
INFO:root:Found 3/375 >1 partitions
INFO:root:Clustering columns...
INFO:root:Merging clustered tables...
CPU times: user 1.99 s, sys: 926 ms, total: 2.92 s
Wall time: 16.2 s
[4]:
# Clusters with a non-empty "partColAligns" entry, ordered from the most
# entries to the fewest (ties keep their original relative order).
nontrivial_clusters = sorted(
    (table for table in clusters if table.get("partColAligns")),
    key=lambda table: len(table.get("partColAligns")),
    reverse=True,
)
[5]:
# Inspect the first entry of `nontrivial_clusters`, i.e. the cluster with the
# most "partColAligns" entries (the list is sorted by that count, descending).
t = nontrivial_clusters[0]
print(f'Table {t["_id"]} was created from {len(t["partColAligns"])} tables')
# Preview of the merged table itself.
print('Result:')
display(takco.preview(t))
# Preview of the entries stored under "partColAligns".
# NOTE(review): ntables=None presumably lifts the limit on how many tables are
# previewed -- confirm against takco.preview.
print('Original:')
display(takco.preview(t["partColAligns"], ntables=None))
Table part-0 was created from 6 tables
Result:
? | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Preceded by | Preceded by | 3P% | FT% | Succeeded by | FG% | Year | Team | Role | Notes | GP | GS | MPG | RPG | APG | SPG | BPG | PPG | |
Manius Acilius Glabrio Gnaeus Cornelius Severus , Marcus Valerius Homullus | Gaius Bruttius Praesens | Political offices | Consul of the Roman Empire 153 with Aulus Junius Rufinus | Lucius Verus | Titus Sextius Lateranus | |||||||||||||
Commodus , Publius Martius Veru | Gaius Bruttius Praesens | Political offices | Consul of the Roman Empire 180 with Sextus Quintilius Condianus | Commodus | Lucius Antistius Burrus | |||||||||||||
Samuel Rodgers | Tom Boyd (politician) | Parliament of Northern Ireland | Member of Parliament for Belfast Pottinger 1958 - 1969 | Joshua Cardwell | ||||||||||||||
None | Tom Boyd (politician) | Political offices | Leader of the Northern Ireland Labour Party at Stormont 1958 - 1969 | Vivian Simpson | ||||||||||||||
Anna Brolly | Pat O'Rawe | Political offices | Mayor of Armagh 2003 - 04 | Eric Speers |
(342 more rows)
Original:
? | 0 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|
∈ |
|
|
|
|
|
|
_pgTitle | Preceded by | Succeeded by | Succeeded by , | |||
Gaius Bruttius Praesens | Political offices | Manius Acilius Glabrio Gnaeus Cornelius Severus , Marcus Valerius Homullus | Consul of the Roman Empire 153 with Aulus Junius Rufinus | Lucius Verus | Titus Sextius Lateranus | |
Gaius Bruttius Praesens | Political offices | Commodus , Publius Martius Veru | Consul of the Roman Empire 180 with Sextus Quintilius Condianus | Commodus | Lucius Antistius Burrus |
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
∈ |
|
|
|
|
|
_pgTitle | Preceded by | Succeeded by | |||
Tom Boyd (politician) | Parliament of Northern Ireland | Samuel Rodgers | Member of Parliament for Belfast Pottinger 1958 - 1969 | Joshua Cardwell | |
Tom Boyd (politician) | Political offices | None | Leader of the Northern Ireland Labour Party at Stormont 1958 - 1969 | Vivian Simpson | |
Pat O'Rawe | Political offices | Anna Brolly | Mayor of Armagh 2003 - 04 | Eric Speers | |
Pat O'Rawe | Northern Ireland Assembly | John Fee | MLA for Newry and Armagh 2003 - 2007 | Cathal Boylan | |
Walther Dahl | Military offices | Major Gerhard Michalski | Commander of Jagdgeschwader z.b.V. 20 May 1944 – 6 June 1944 | Major Gerhard Schöpfel |
(109 more rows)
(26 more rows)
? | 0 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|
∈ |
|
|
|
|
|
|
_pgTitle | No. | Year | Tournament | Opponent | Result | |
Meaghan Francella | 1 | 2007 | MasterCard Classic | Annika Sörenstam | Won with birdie on fourth extra hole |
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
∈ |
|
|
|
|
|
_pgTitle | Year | Title | Role | Notes | |
Frances Bavier | 1952 | Racket Squad | Martha Carver | 1 episode | |
Frances Bavier | 1952– 1953 | Gruen Guild Playhouse | Sarah Cummings | 2 episodes | |
Frances Bavier | 1953 | Hallmark Hall of Fame | Lou Bloor | 1 episode | |
Frances Bavier | 1953– 1954 | City Detective | Various roles | 3 episodes | |
Frances Bavier | 1953– 1954 | Letter to Loretta | Various roles | 3 episodes |
(171 more rows)
? | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
∈ |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
_pgTitle | Year | Team | GP | GS | MPG | FG% | 3P% | FT% | RPG | APG | SPG | BPG | PPG | |
Eric Maynor | 2010 | Oklahoma City | 6 | 0 | 12.7 | .300 | .167 | .818 | 1.5 | 1.5 | .2 | .2 | 3.7 | |
Eric Maynor | 2011 | Oklahoma City | 17 | 0 | 12.9 | .377 | .360 | .789 | 1.3 | 2.2 | .5 | .0 | 4.8 | |
Eric Maynor | Career | 23 | 0 | 12.9 | .361 | .323 | .800 | 1.3 | 2.0 | .4 | .0 | 4.5 | ||
Mike Conley, Jr. | 2007–08 | Memphis | 53 | 46 | 26.1 | .428 | .330 | .732 | 2.6 | 4.2 | .8 | .0 | 9.4 | |
Mike Conley, Jr. | 2008–09 | Memphis | 82 | 61 | 30.6 | .442 | .406 | .817 | 3.4 | 4.3 | 1.1 | .1 | 10.9 |
(18 more rows)
All steps separately
[16]:
%%time
from takco.cluster.matchers import LSHMatcher, EmbeddingMatcher, CellJaccMatcher
from takco import cluster
fdir = '.'
matchers = [
CellJaccMatcher(fdir, name='headjacc', source='head'),
LSHMatcher(fdir, name='lsh', num_perm=64),
EmbeddingMatcher(fdir, name='emb', wordvec_fname='/export/scratch1/home/kruit/glove.6B.50d.pickle'),
CellJaccMatcher(fdir, name='sec', source='sectionTitle'),
]
tables = takco.TableSet.number_table_columns(tables).persist()
matchers = tables.pipe(cluster.matcher_add_tables, matchers)
matchers = list(matchers.fold(lambda x: x.name, lambda a, b: a.merge(b)))
for m in matchers:
print(m.name)
m.index()
INFO:root:Numbering tables...
DEBUG:root:Opening ../../data/TabEL/10k-part/10k-part-aa
DEBUG:root:Serial cumsum tableIndex tableIndex 1 -> 198
DEBUG:root:Serial cumsum numCols columnIndexOffset 0 -> 910
DEBUG:root:Piping matcher_add_tables ...
INFO:root:Loading word vectors /export/scratch1/home/kruit/glove.6B.50d.pickle
Loading tables into matchers: 198it [00:00, 242.43it/s]
Indexing lsh: 100%|██████████| 647/647 [00:00<00:00, 8045.81it/s]
DEBUG:root:faiss info: analyzing 605 vectors of size 50
no NaN or Infs in data
567 vectors are distinct (93.72%)
vector 2 has 12 copies
range of L2 norms=[1, 1] (0 null vectors)
vectors are normalized, inner product and L2 search are equivalent
matrix contains no 0s
no constant dimensions
no dimension has a too large mean
stddevs per dimension are in [0.0807233 0.25964]
headjacc
lsh
emb
sec
CPU times: user 2.53 s, sys: 321 ms, total: 2.85 s
Wall time: 2.85 s
[ ]:
# Look at a block
# NOTE(review): this cell has no execution count. It reads `tableid_colids`,
# which is only assigned in a later cell, and `i_table`, which is not assigned
# in any visible cell -- it will fail on a fresh kernel until both exist.
ti = 0
matcher = matchers[1] # LSH matcher (index 1 in the recorded run)
print(matcher.name)

# Ask the matcher for the candidate block of table `ti`.
with matcher:
    block = set(matcher.block(ti, tableid_colids[ti]))
print(f'Got block of size {len(block)}:', block)

# First table is query, rest is block
takco.preview([i_table[ti], *(i_table[b] for b in block if b in i_table)])
[17]:
%%time
# Build a dict from the items produced by cluster.get_table_ids. Other cells
# index it by table index (`tableid_colids[ti]`) and iterate over its keys.
# NOTE(review): values are presumably the column ids of each table -- confirm
# against cluster.get_table_ids.
tableid_colids = dict(tables.pipe(cluster.get_table_ids))
# Number of tables in the mapping.
print(len(tableid_colids))
DEBUG:root:Piping get_table_ids ...
198
CPU times: user 325 ms, sys: 702 µs, total: 325 ms
Wall time: 325 ms
[18]:
%%time
import pandas as pd
tablesim = pd.concat(tables.pipe(
cluster.get_tablesims,
matchers=matchers,
filter_matcher_names=['sec'],
agg_func='max',
agg_threshold=0.9,
align_columns='max1',
tableid_colids=tableid_colids,
))
tablesim
DEBUG:root:Piping get_tablesims ...
DEBUG:root:Loading <takco.cluster.matchers.lsh.LSHMatcher object at 0x7f2794631250> from disk...
INFO:root:Loading word vectors /export/scratch1/home/kruit/glove.6B.50d.pickle
DEBUG:root:Loading <takco.cluster.matchers.embedding.EmbeddingMatcher object at 0x7f2795fe5bd0> from disk...
DEBUG:root:Preparing block for matcher headjacc
DEBUG:root:Preparing block for matcher lsh
DEBUG:root:Preparing block for matcher emb
DEBUG:root:Querying emb faiss index with query matrix of shape (605, 50)
Blocking: 100%|██████████| 198/198 [00:00<00:00, 1822.86it/s]
DEBUG:root:Found 4847 pairs; 24 ± 23 per table
Looking up sec: 100%|██████████| 4847/4847 [00:00<00:00, 139013.77it/s]
DEBUG:root:Filtered down to 410 pairs
Looking up headjacc: 100%|██████████| 410/410 [00:00<00:00, 8841.24it/s]
Looking up lsh: 100%|██████████| 410/410 [00:00<00:00, 58388.72it/s]
DEBUG:root:Calculating 5262 lsh scores
Yielding lsh: 100%|██████████| 5262/5262 [00:00<00:00, 751308.13it/s]
Looking up emb: 100%|██████████| 410/410 [00:00<00:00, 57287.78it/s]
DEBUG:root:Calculating 4880 emb scores
Yielding emb: 100%|██████████| 4880/4880 [00:00<00:00, 842729.06it/s]
DEBUG:root:times: Timer(prepare_headjacc=4.3e-06, prepare_lsh=2.6e-06, prepare_emb=2.0e-01, block_headjacc=3.0e-03, block_lsh=9.6e-02, block_emb=1.2e-03, filter_sec=3.8e-02, match_headjacc=4.9e-02, match_lsh=2.5e-02, match_emb=2.2e-02)
DEBUG:root:Creating dataframe of column match scores
INFO:numexpr.utils:Note: NumExpr detected 32 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.
CPU times: user 4.31 s, sys: 663 ms, total: 4.97 s
Wall time: 1.7 s
[18]:
ti1 ti2
1 55 1.000000
67 1.000000
69 1.000000
78 1.000000
106 1.000000
...
189 22 0.919705
87 0.944925
168 1.000000
195 86 0.989103
133 0.941177
Length: 182, dtype: float64
[19]:
%%time
itups = ((ti,ti) for ti in tableid_colids)
ii = pd.MultiIndex.from_tuples(itups, names=['ti1', 'ti2'])
tablesim = pd.concat([tablesim, pd.Series(1, index=ii)])
CPU times: user 5.48 ms, sys: 91 µs, total: 5.57 ms
Wall time: 4.91 ms
[20]:
%%time
# Partition the tables with cluster.louvain, using the table similarities.
# NOTE(review): edge_exp is presumably an exponent applied to the edge weights
# of the similarity graph -- confirm against cluster.louvain.
edge_exp = 5
louvain_partition = cluster.louvain(tablesim, edge_exp=edge_exp)
# Number of partitions found.
print(len(louvain_partition))
INFO:root:Created graph IGRAPH U-W- 198 380 -- + attr: weight (e)
163
CPU times: user 25.4 ms, sys: 4.88 ms, total: 30.3 ms
Wall time: 30 ms
[10]:
# Partitions that contain at least two members; single-member partitions are
# dropped.
nonsingle = [members for members in louvain_partition if len(members) >= 2]
len(nonsingle)
[10]:
15
[11]:
import logging as log
# NOTE(review): ChainMap is imported but not used anywhere in this cell.
from collections import ChainMap

# Only show warnings and above from here on.
log.getLogger().setLevel(log.WARNING)

# Cluster the columns of every multi-table partition. The partitions are
# passed as (partition index, partition) pairs.
chunks = tables.new(enumerate(nonsingle)).pipe(
    cluster.cluster_partition_columns,
    tableid_colids=tableid_colids,
    matchers=matchers,
    agg_func='max',
    agg_threshold_col=0.5,
)

# Each chunk is a tuple of four dicts. Merge the dicts position by position
# across all chunks; for a repeated key the value from the later chunk wins.
merged_fields = []
for field_dicts in zip(*chunks):
    merged = {}
    for field_dict in field_dicts:
        merged.update(field_dict.items())
    merged_fields.append(merged)
ti_pi, pi_ncols, ci_pci, ti_colsim = merged_fields

len(ti_pi), len(pi_ncols), len(ci_pci)
DEBUG:root:Piping cluster_partition_columns ...
DEBUG:root:Loading <takco.cluster.matchers.lsh.LSHMatcher object at 0x7f988fc97810> from disk...
INFO:root:Loading word vectors /export/scratch1/home/kruit/glove.6B.50d.pickle
DEBUG:root:Loading <takco.cluster.matchers.embedding.EmbeddingMatcher object at 0x7f988fc97650> from disk...
Matching with headjacc: 0%| | 0/78 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/78 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 78/78 [00:00<00:00, 7528.61it/s]
Looking up headjacc: 100%|██████████| 78/78 [00:00<00:00, 8679.02it/s]
Matching with lsh: 0%| | 0/78 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/78 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 78/78 [00:00<00:00, 12591.63it/s]
Looking up lsh: 100%|██████████| 78/78 [00:00<00:00, 16795.30it/s]
DEBUG:root:Calculating 489 lsh scores
Yielding lsh: 100%|██████████| 489/489 [00:00<00:00, 674454.01it/s]
Matching with emb: 0%| | 0/78 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/78 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 78/78 [00:00<00:00, 9558.97it/s]
Looking up emb: 100%|██████████| 78/78 [00:00<00:00, 11990.75it/s]
DEBUG:root:Calculating 489 emb scores
Yielding emb: 100%|██████████| 489/489 [00:00<00:00, 657587.26it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (36, 36) column similarities
DEBUG:root:Partition 0 has 12 tables and 3 column clusters
Matching with headjacc: 0%| | 0/6 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/6 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 6/6 [00:00<00:00, 655.34it/s]
Looking up headjacc: 100%|██████████| 6/6 [00:00<00:00, 965.80it/s]
Matching with lsh: 0%| | 0/6 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/6 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 6/6 [00:00<00:00, 1071.62it/s]
Looking up lsh: 100%|██████████| 6/6 [00:00<00:00, 794.00it/s]
DEBUG:root:Calculating 24 lsh scores
Yielding lsh: 100%|██████████| 24/24 [00:00<00:00, 7031.52it/s]
Matching with emb: 0%| | 0/6 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/6 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 6/6 [00:00<00:00, 1099.14it/s]
Looking up emb: 100%|██████████| 6/6 [00:00<00:00, 1614.54it/s]
DEBUG:root:Calculating 6 emb scores
Yielding emb: 100%|██████████| 6/6 [00:00<00:00, 22753.91it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (33, 33) column similarities
DEBUG:root:Partition 1 has 3 tables and 11 column clusters
Matching with headjacc: 0%| | 0/10 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/10 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 10/10 [00:00<00:00, 1744.72it/s]
Looking up headjacc: 100%|██████████| 10/10 [00:00<00:00, 1611.64it/s]
Matching with lsh: 0%| | 0/10 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/10 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 10/10 [00:00<00:00, 1854.98it/s]
Looking up lsh: 100%|██████████| 10/10 [00:00<00:00, 1895.99it/s]
DEBUG:root:Calculating 10 lsh scores
Yielding lsh: 100%|██████████| 10/10 [00:00<00:00, 13430.37it/s]
Matching with emb: 0%| | 0/10 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/10 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 10/10 [00:00<00:00, 2443.09it/s]
Looking up emb: 100%|██████████| 10/10 [00:00<00:00, 3488.57it/s]
DEBUG:root:Calculating 10 emb scores
Yielding emb: 100%|██████████| 10/10 [00:00<00:00, 53430.62it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (4, 4) column similarities
DEBUG:root:Partition 2 has 4 tables and 1 column clusters
Matching with headjacc: 0%| | 0/10 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/10 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 10/10 [00:00<00:00, 2306.59it/s]
Looking up headjacc: 100%|██████████| 10/10 [00:00<00:00, 3189.34it/s]
Matching with lsh: 0%| | 0/10 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/10 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 10/10 [00:00<00:00, 2410.52it/s]
Looking up lsh: 100%|██████████| 10/10 [00:00<00:00, 3365.67it/s]
DEBUG:root:Calculating 10 lsh scores
Yielding lsh: 100%|██████████| 10/10 [00:00<00:00, 44243.71it/s]
Matching with emb: 0%| | 0/10 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/10 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 10/10 [00:00<00:00, 2460.72it/s]
Looking up emb: 100%|██████████| 10/10 [00:00<00:00, 3420.85it/s]
DEBUG:root:Calculating 10 emb scores
Yielding emb: 100%|██████████| 10/10 [00:00<00:00, 1987.26it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (8, 8) column similarities
DEBUG:root:Partition 3 has 4 tables and 2 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 717.87it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 798.76it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 754.05it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1046.83it/s]
DEBUG:root:Calculating 3 lsh scores
Yielding lsh: 100%|██████████| 3/3 [00:00<00:00, 15015.41it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 747.74it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1038.79it/s]
DEBUG:root:Calculating 3 emb scores
Yielding emb: 100%|██████████| 3/3 [00:00<00:00, 14513.16it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (4, 4) column similarities
DEBUG:root:Partition 4 has 2 tables and 2 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 677.81it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 923.31it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 752.79it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1047.01it/s]
DEBUG:root:Calculating 49 lsh scores
Yielding lsh: 100%|██████████| 49/49 [00:00<00:00, 205110.67it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 733.06it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 813.48it/s]
DEBUG:root:Calculating 49 emb scores
Yielding emb: 100%|██████████| 49/49 [00:00<00:00, 197047.84it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (12, 12) column similarities
DEBUG:root:Partition 5 has 2 tables and 8 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 694.27it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 965.91it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 781.64it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1115.51it/s]
DEBUG:root:Calculating 37 lsh scores
Yielding lsh: 100%|██████████| 37/37 [00:00<00:00, 148222.78it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 798.10it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1099.71it/s]
DEBUG:root:Calculating 37 emb scores
Yielding emb: 100%|██████████| 37/37 [00:00<00:00, 177562.07it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (9, 9) column similarities
DEBUG:root:Partition 6 has 2 tables and 5 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 756.82it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 1082.03it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 759.79it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1060.24it/s]
DEBUG:root:Calculating 75 lsh scores
Yielding lsh: 100%|██████████| 75/75 [00:00<00:00, 306004.67it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 801.36it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1110.29it/s]
DEBUG:root:Calculating 75 emb scores
Yielding emb: 100%|██████████| 75/75 [00:00<00:00, 310535.83it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (10, 10) column similarities
DEBUG:root:Partition 7 has 2 tables and 5 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 568.03it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 728.85it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 737.09it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1031.98it/s]
DEBUG:root:Calculating 313 lsh scores
Yielding lsh: 100%|██████████| 313/313 [00:00<00:00, 700169.15it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 744.64it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1012.22it/s]
DEBUG:root:Calculating 313 emb scores
Yielding emb: 100%|██████████| 313/313 [00:00<00:00, 617215.40it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (22, 22) column similarities
DEBUG:root:Partition 8 has 2 tables and 19 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 595.47it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 946.65it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 796.39it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 607.78it/s]
DEBUG:root:Calculating 93 lsh scores
Yielding lsh: 100%|██████████| 93/93 [00:00<00:00, 363532.41it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 551.71it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 680.49it/s]
DEBUG:root:Calculating 93 emb scores
Yielding emb: 100%|██████████| 93/93 [00:00<00:00, 338602.67it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (15, 15) column similarities
DEBUG:root:Partition 9 has 2 tables and 8 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 466.14it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 578.18it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 760.76it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1031.30it/s]
DEBUG:root:Calculating 271 lsh scores
Yielding lsh: 100%|██████████| 271/271 [00:00<00:00, 586510.00it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 734.00it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1001.82it/s]
DEBUG:root:Calculating 271 emb scores
Yielding emb: 100%|██████████| 271/271 [00:00<00:00, 541007.32it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (27, 27) column similarities
DEBUG:root:Partition 10 has 2 tables and 16 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 738.00it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 1032.15it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 785.40it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1096.74it/s]
DEBUG:root:Calculating 27 lsh scores
Yielding lsh: 100%|██████████| 27/27 [00:00<00:00, 100932.45it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 791.68it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1097.03it/s]
DEBUG:root:Calculating 27 emb scores
Yielding emb: 100%|██████████| 27/27 [00:00<00:00, 4345.60it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (8, 8) column similarities
DEBUG:root:Partition 11 has 2 tables and 4 column clusters
Matching with headjacc: 0%| | 0/15 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/15 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 15/15 [00:00<00:00, 2833.99it/s]
Looking up headjacc: 100%|██████████| 15/15 [00:00<00:00, 2474.03it/s]
Matching with lsh: 0%| | 0/15 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/15 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 15/15 [00:00<00:00, 3951.92it/s]
Looking up lsh: 100%|██████████| 15/15 [00:00<00:00, 3141.64it/s]
DEBUG:root:Calculating 73 lsh scores
Yielding lsh: 100%|██████████| 73/73 [00:00<00:00, 241470.18it/s]
Matching with emb: 0%| | 0/15 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/15 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 15/15 [00:00<00:00, 3963.12it/s]
Looking up emb: 100%|██████████| 15/15 [00:00<00:00, 5353.52it/s]
DEBUG:root:Calculating 73 emb scores
Yielding emb: 100%|██████████| 73/73 [00:00<00:00, 298134.56it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (20, 20) column similarities
DEBUG:root:Partition 12 has 5 tables and 7 column clusters
Matching with headjacc: 0%| | 0/10 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/10 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 10/10 [00:00<00:00, 1925.32it/s]
Looking up headjacc: 100%|██████████| 10/10 [00:00<00:00, 2477.44it/s]
Matching with lsh: 0%| | 0/10 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/10 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 10/10 [00:00<00:00, 2413.57it/s]
Looking up lsh: 100%|██████████| 10/10 [00:00<00:00, 3281.41it/s]
DEBUG:root:Calculating 234 lsh scores
Yielding lsh: 100%|██████████| 234/234 [00:00<00:00, 565035.77it/s]
Matching with emb: 0%| | 0/10 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/10 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 10/10 [00:00<00:00, 1983.59it/s]
Looking up emb: 100%|██████████| 10/10 [00:00<00:00, 3409.45it/s]
DEBUG:root:Calculating 234 emb scores
Yielding emb: 100%|██████████| 234/234 [00:00<00:00, 568966.46it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (22, 22) column similarities
DEBUG:root:Partition 13 has 4 tables and 10 column clusters
Matching with headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Looking up headjacc: 0%| | 0/3 [00:00<?, ?it/s]
Matching with headjacc: 100%|██████████| 3/3 [00:00<00:00, 642.71it/s]
Looking up headjacc: 100%|██████████| 3/3 [00:00<00:00, 853.19it/s]
Matching with lsh: 0%| | 0/3 [00:00<?, ?it/s]
Looking up lsh: 0%| | 0/3 [00:00<?, ?it/s]
Matching with lsh: 100%|██████████| 3/3 [00:00<00:00, 751.35it/s]
Looking up lsh: 100%|██████████| 3/3 [00:00<00:00, 1057.30it/s]
DEBUG:root:Calculating 127 lsh scores
Yielding lsh: 100%|██████████| 127/127 [00:00<00:00, 354408.92it/s]
Matching with emb: 0%| | 0/3 [00:00<?, ?it/s]
Looking up emb: 0%| | 0/3 [00:00<?, ?it/s]
Matching with emb: 100%|██████████| 3/3 [00:00<00:00, 778.12it/s]
Looking up emb: 100%|██████████| 3/3 [00:00<00:00, 1094.55it/s]
DEBUG:root:Calculating 93 emb scores
Yielding emb: 100%|██████████| 93/93 [00:00<00:00, 319729.73it/s]
DEBUG:root:Creating colsim dataframe
DEBUG:root:Clustering (14, 14) column similarities
DEBUG:root:Partition 14 has 2 tables and 8 column clusters
[11]:
(50, 15, 244)
[61]:
# Apply the partition and column-cluster assignments to the tables.
partition_tables = tables.pipe(
    cluster.set_partition_columns, ti_pi, pi_ncols, ci_pci
)

# Merge all tables that share an "_id" into one table per id. For each merged
# part, keep its "tableHeaders" and the first ten rows of its "tableData".
# This rebinds `clusters`, replacing any value assigned by an earlier cell.
clusters = partition_tables.fold(
    lambda table: table["_id"],
    lambda left, right: cluster.merge_partition_tables(
        left,
        right,
        keep_partition_meta=[
            "tableHeaders",
            lambda part: {'tableData': part["tableData"][:10]},
        ],
    ),
).persist()
DEBUG:root:Piping set_partition_columns ...
[63]:
# Take the first cluster that has a non-empty "partColAligns" entry and
# preview what is stored under that key.
clusters_with_parts = [table for table in clusters if table.get("partColAligns")]
t = clusters_with_parts[0]
takco.preview(t["partColAligns"])
[63]:
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link |
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link |
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic |
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
(5 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
(5 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
(5 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
(5 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
(5 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
Source | Rating | ||
Review scores | Allmusic | link | |
Review scores | Entertainment Weekly | (B) link | |
Review scores | Allmusic | link | |
Review scores | Allmusic | ||
Review scores | Allmusic |
(5 more rows)