T2D benchmark
[28]:
import logging as log

# Root logger at INFO so the download / unpack / read progress messages
# emitted while the dataset is being prepared are shown in the cell output.
log.getLogger().setLevel(log.INFO)

import takco

# Files of the T2D v1 gold standard hosted by Web Data Commons.
t2d_v1_sources = [
    'http://webdatacommons.org/webtables/tables_instance.tar.gz',
    'http://webdatacommons.org/webtables/entities_instance.tar.gz',
    'http://webdatacommons.org/webtables/classes_instance.csv',
    'http://webdatacommons.org/webtables/attributes_instance.tar.gz',
]

dataset = takco.evaluate.dataset.T2D(
    name='t2d-v1',
    version=1,
    resourcedir='resources',
    datadir='data',
    download=t2d_v1_sources,
)

# Disabled alternative: build the dataset from files already on disk
# (paths point at a local TAIPAN "T2DStar" checkout plus t2d-v2 class
# annotations) instead of downloading the v1 archives.
# dataset = takco.evaluate.dataset.T2D(
#     resourcedir='resources',
#     datadir='data',
#     tabledir = 'data/taipan/TAIPAN-Datasets-master/T2DStar/tables',
#     # entitydir = 'data/t2d-v2/instance',
#     classfile = 'data/t2d-v2/classes_GS.csv',
#     # propdir = 'data/t2d-v2/property',
#     keycolfile = 'data/taipan/TAIPAN-Datasets-master/T2DStar/subject_column.csv',
#     propdir = 'data/taipan/TAIPAN-Datasets-master/T2DStar/properties_t2d_format',
# )

# Render the gold-standard annotations in the same form as predictions,
# as a quick visual sanity check of what was loaded.
annotated_tables = dataset.get_annotated_tables_as_predictions()
takco.preview(annotated_tables)
INFO:root:Downloading http://webdatacommons.org/webtables/tables_instance.tar.gz to data/t2d-v1/tables_instance.tar.gz
INFO:root:Unpacking data/t2d-v1/tables_instance.tar.gz to data/t2d-v1/tables_instance
INFO:root:Downloading http://webdatacommons.org/webtables/entities_instance.tar.gz to data/t2d-v1/entities_instance.tar.gz
INFO:root:Unpacking data/t2d-v1/entities_instance.tar.gz to data/t2d-v1/entities_instance
INFO:root:Downloading http://webdatacommons.org/webtables/classes_instance.csv to data/t2d-v1/classes_instance.csv
INFO:root:Downloading http://webdatacommons.org/webtables/attributes_instance.tar.gz to data/t2d-v1/attributes_instance.tar.gz
INFO:root:Unpacking data/t2d-v1/attributes_instance.tar.gz to data/t2d-v1/attributes_instance
INFO:root:Read 233 tables from data/t2d-v1/tables_instance
INFO:root:Read 233 key cols from data/t2d-v1/attributes_instance
INFO:root:Read 233 prop files from data/t2d-v1/attributes_instance
INFO:root:Read 233 class tables from data/t2d-v1/classes_instance.csv
INFO:root:Read 233 table headerrows from data/t2d-v1/entities_instance
INFO:root:Read 233 entity tables from data/t2d-v1/entities_instance
[28]:
? | 0 | 1 | 2 |
---|---|---|---|
∈ |
|
||
0 |
|
||
title | author | source | |
adventures of huckleberry finn | mark twain | ala [11] | |
the adventures of super diaper baby | dav pilkey | ala [47] | |
the adventures of tom sawyer | mark twain | ala | |
alice series | phyllis reynolds naylor | ala [2] | |
all the king's men | robert penn warren | rad |
(146 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
∈ |
|
||
# | media | mix | |
1 | dainik jagran | 27.500 | |
2 | dainik bhaskar | 14.000 | |
3 | aajtak tv | 7.000 | |
4 | cnn editions (international) | 6.000 | |
5 | dinakaran | 5.000 |
(16 more rows)
? | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|---|
∈ |
|
||||||
2 |
|
|
|
||||
# | gebäude | gebäude | stadt | etagen | höhe | jahr | |
1 | NULL | burj khalifa | dubai | 163 | 2.717 ft | 2010 | |
2 | NULL | makkah clock royal tower [abraj al bait] | mekka | 95 | 1.972 ft | 2012 | |
3 | NULL | taipei 101 | taipei | 101 | 1.671 ft | 2004 | |
4 | NULL | shanghai world financial center | shanghai | 101 | 1.614 ft | 2008 | |
5 | NULL | international commerce centre [union square] | hong kong | 118 | 1.588 ft | 2010 |
(195 more rows)
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
∈ |
|
||||
1 |
|
||||
rank | company | industry | temkin experience rating (ter) | company ter vs industry ter | |
1 | sam's club | retailer | 85% | 13.0 | |
2 | publix | grocery chain | 81% | 4.9 | |
3 | a credit union | bank | 80% | 14.5 | |
3 | chick-fil-a | fast food chain | 80% | 6.2 | |
3 | subway | fast food chain | 80% | 6.4 |
(201 more rows)
? | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|---|
∈ |
|
||||||
0 |
|
||||||
peak | ranking | map | guide | grid ref | alt (ft) | alt (m) | |
allen crags | 43 | sw | e | ny 236 085 | 2572 | 784 | |
angletarn pikes | 143 | ne | fe | ny 414 148 | 1857 | 566 | |
ard crags | 142 | nw | nw | ny 207 197 | 1860 | 567 | |
armboth fell | 182 | nw | c | ny 297 159 | 1570 | 479 | |
arnison crag | 194 | ne | e | ny 394 150 | 1424 | 434 |
(210 more rows)
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
∈ |
|
||||
1 |
|
|
|
|
|
a | nom en anglais | endroit | capitale | heure | |
afghanistan | afghanistan | asie | kabul | +4.5 | |
afrique du sud | south afrique | afrique | pretoria | +2 | |
albanie | albania | europe | tirane | +1 | |
alderney (uk) voir îles (...) | alderney | europe | 0 | ||
algérie | algeria | afrique | algiers | +1 |
(228 more rows)
? | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
---|---|---|---|---|---|---|---|
∈ |
|
||||||
1 |
|
|
|
||||
NULL | country name: | population | area (sq. km.) | population density (sq. km.) | area (sq. mi.) | population density (sq. mi.) | |
36 | china | 1339190000 | 9596960.00 | 139.54 | 3705405.45 | 361.42 | |
77 | india | 1184639000 | 3287590.00 | 360.34 | 1269345.07 | 933.27 | |
183 | united states of america | 309975000 | 9629091.00 | 32.19 | 3717811.29 | 83.38 | |
78 | indonesia | 234181400 | 1919440.00 | 122.01 | 741099.62 | 315.99 | |
24 | brazil | 193364000 | 8511965.00 | 22.72 | 3286486.71 | 58.84 |
(188 more rows)
? | 0 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|
∈ |
|
|||||
0 |
|
|
|
|||
title | publisher | eu release date | au release date | pegi | acb | |
donkey kong country | nintendo | 2006-12-08 | 2006-12-07 | 7 | g | |
f-zero | nintendo | 2006-12-08 | 2006-12-07 | 3 | g | |
simcity | nintendo | 2006-12-29 | 2006-12-29 | 3 | g | |
super castlevania iv | konami | 2006-12-29 | 2006-12-29 | 3 | pg | |
street fighter ii: the world warrior | capcom | 2007-01-19 | 2007-01-19 | 12 | pg |
(60 more rows)
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
∈ |
|
||||
1 |
|
||||
dial location | call letters | format | address | telephone | |
am 790 | kabc (abc radio networks) | news/talk | 3321 s la cienega blvd los angeles 90016 | (310) 840-4900 | |
am 900 | kali am | spanish news/talk | 747 e green st pasadena 91101 | (626) 844-8882 | |
am 1300 | kazn(asian radio) | chinese variety | 747 e green st pasadena 91101 | (626) 568-1300 | |
am 1580 | kbla | spanish news/talk | 123 figueroa st #101a los angeles 90012 | (213) 628-8700 | |
am 740 | kbrt(k-bright) | religious talk | 3183-d airway ave costa mesa 92626 | (714) 754-4450 |
(25 more rows)
? | 0 | 1 | 2 |
---|---|---|---|
∈ |
|
||
local health boards | hospital name | link surgeons | |
abertawe bro morgannwg university lhb | morriston hospital (swansea) | roger morgan | |
singleton hospital (swansea) | roger morgan | ||
princess of wales hospital (bridgend) | roger morgan | ||
aneurin bevan lhb | neville hall hospital (abergavenny) | richard blackett | |
royal gwent hospital (newport) | ahmed shandall |
(11 more rows)
[39]:
# Build the entity searcher named 'dbpedia_t2ksubset_es' from the baseline
# pipeline configuration and show how it was configured.
searcher = takco.config.build(
    'dbpedia_t2ksubset_es',
    load='resources/pipelines/t2d-v2-baseline.toml',
)
print(searcher)

# Smoke test: look up a single label and print the URIs of the results.
# The searcher is used as a context manager, so the lookup must happen
# inside the `with` body.
with searcher:
    for rs in searcher.search_entities([('USA', {})]):
        print([r.uri for r in rs])
ElasticSearcher(index='dbpediasub-7', baseuri='http://dbpedia.org/resource/', propbaseuri='http://dbpedia.org/ontology/', es_kwargs={'hosts': ['bricks07'], 'timeout': 60}, parts=True, prop_uri={'type': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}, prop_baseuri={'type': 'http://dbpedia.org/ontology/'}, typer=SimpleTyper(use_dateparser='dateutil'), stringmatch='jaccard')
['http://dbpedia.org/resource/United_States']
[30]:
%%capture --no-display
# Connect this notebook to an already-running Dask scheduler.
# NOTE(review): the scheduler address is hard-coded to a private IP and
# port; change it to match your own cluster before running this cell.
from dask.distributed import Client
client = Client(address = 'tcp://192.168.62.207:8686')
# Trailing bare expression so the notebook shows the client summary;
# the cell magic on the first line captures the remaining cell output.
client
[30]:
Client
|
Cluster
|
[43]:
%%time
baseline = takco.config.build('linker', load='resources/pipelines/t2d-v2-baseline.toml')
pred = takco.TableSet.link(
# takco.DaskHashBag(takco.TableSet.dataset(dataset), client=client),
takco.TableSet(list(takco.TableSet.dataset(dataset))[:10]),
usecols = "keycol",
linker = baseline,
)
pred.tables.persist()
report = pred.report(keycol_only=True, curve = True )
import pandas as pd
pd.DataFrame.from_dict( report.get('scores'), orient='index' )
CPU times: user 641 ms, sys: 33.6 ms, total: 675 ms
Wall time: 1.44 s
[43]:
precision | recall | f1-score | support | predictions | |
---|---|---|---|---|---|
entities | 0.27907 | 0.433735 | 0.339623 | 166 | 258 |
[44]:
from takco.evaluate import pr_plot

# Plot the curve data collected by the report of the linking run
# (the report was requested with curve=True).
curves = report['curves']
pr_plot(curves)
[44]:
[45]:
# Inspect the linking predictions for up to 25 tables. nrows=None is
# presumably "no row limit" (confirm against takco.preview), and
# hide_correct_rows=True asks the preview to leave out rows that were
# linked correctly so the remaining rows are the ones worth examining.
takco.preview(
    pred,
    nrows=None,
    ntables=25,
    hide_correct_rows=True,
)
[45]:
? | 0 | 1 | 2 | 3 |
---|---|---|---|---|
∈ |
|
|||
Brian Mulroney 💡 | ||||
Joe Clark
|
||||
John George Diefenbaker 💡 | ||||
John Sparrow David Thompson
|
||||
Lester Bowles Pearson 💡 | ||||
Louis Stephen St . Laurent 💡 | ||||
Paul Martin
|
||||
Richard Bedford Bennett 💡 | ||||
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
∈ |
|
|
|||
0 |
|
||||
1 |
|
||||
Ontario College of Art & Design
|
Toronto
|
||||
? | 0 | 1 | 2 |
---|---|---|---|
∈ |
|
|
|
0 |
|
||
1 |
|
||
Greater Yellowlegs
|
Tringa melanoleuca | ||
Lesser Yellowlegs
|
Tringa flavipes | ||
Willet
|
Catoptrophorus semipalmatus | ||
Wandering Tattler
|
Heteroscelus incanus | ||
Spotted Sandpiper
|
Actitis macularia
|
||
Upland Sandpiper
|
Bartramia longicauda | ||
Ruddy Turnstone
|
Arenaria interpres | ||
Black Turnstone
|
Arenaria melanocephala
|
||
Surfbird
|
Aphriza virgata | ||
Sanderling
|
Calidris alba | ||
Red - necked Stint
|
Calidris ruficollis | ||
Little Stint
|
Calidris minuta | ||
Least Sandpiper
|
Calidris minutilla
|
||
Baird's Sandpiper
|
Calidris bairdii | ||
Pectoral Sandpiper
|
Calidris melanotos | ||
Sharp - tailed Sandpiper
|
Calidris acuminata
|
||
Dunlin
|
Calidris alpina | ||
Curlew Sandpiper
|
Calidris ferruginea | ||
Spoonbilleded Sandpiper | Eurynorhynchus pygmaeus | ||
Buff - breasted Sandpiper
|
Tryngites subruficollis
|
||
Ruff
|
Philomachus pugnax | ||
Long - billed Dowitcher
|
Limnodromus scolopaceus
|
||
Wilson's Snipe
|
Gallinago delicata
|
||
American Woodcock
|
Scolopax minor
|
||
Wilson's Phalarope
|
Phalaropus tricolor
|
||
Red - necked Phalarope
|
Phalaropus lobatus
|
||
Red Phalarope
|
Phalaropus fulicarius | ||
? | 0 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|
∈ |
|
|||||
3 |
|
|||||
Lisa's Rival 💡 |
Mike Scully
|
|||||
Bart of Darkness 💡 | Dan McGrath | |||||
Another Simpsons Clip Show 💡 | Penny Wise | |||||
Itchy & Scratchy Land 💡 |
John Swartzwelder
|
|||||
Sideshow Bob Roberts 💡 | Bill Oakley & Josh Weinstein | |||||
Nightmare Cafeteria 💡 |
David S . Cohen
|
|||||
Bart's Girlfriend |
John Collier
|
|||||
Lisa on Ice 💡 |
Mike Scully
|
|||||
Homer : Bad Man
|
Greg Daniels
|
|||||
Grampa vs . Sexual Inadequacy 💡 | Bill Oakley & Josh Weinstein | |||||
Fear of Flying
|
David Sacks | |||||
Homer the Great 💡 |
John Swartzwelder
|
|||||
A Star is Burns 💡 |
Ken Keeler
|
|||||
And Maggie Makes Three 💡 | Jennifer Crittenden | |||||
Bart's Comet |
John Swartzwelder
|
|||||
Homie the Clown 💡 |
John Swartzwelder
|
|||||
Bart vs . Australia 💡 | Bill Oakley & Josh Weinstein | |||||
Homer vs . Patty & Selma 💡 | Brent Forrester | |||||
Lisa's Wedding |
Greg Daniels
|
|||||
Two Dozen and One Greyhounds |
Mike Scully
|
|||||
The PTA Disbands 💡 | Jennifer Crittenden | |||||
'Round Springfield 💡 | Joshua Sternin & Jeff Ventimilia | |||||
The Springfield Connection 💡 |
John Collier
|
|||||
Lemon of Troy 💡 | Brent Forrester | |||||
Who Shot Mr . Burns ? ( Part One )
|
Bill Oakley & Josh Weinstein | |||||
? | 0 | 1 | 2 | 3 |
---|---|---|---|---|
∈ |
|
|
||
0 |
|
|||
Venta silurum 💡 |
Caerwent
|
|||
Moridunum |
Carmarthen
|
|||
Camulodonum 💡 |
Colchester
|
|||
Noviomagus
|
Chichester
|
|||
Deva
|
Chester
|
|||
Durnovaria 💡 |
Dorchester
|
|||
Glevum |
Gloucester
|
|||
Ratae Coritanorum 💡 |
Leicester
|
|||
Mamucium
|
Manchester
|
|||
Verulamium 💡 |
St Albans
|
|||
Venta Belgarum 💡 |
Winchester
|
|||
Eboracum
|
York
|
|||
? | 0 | 1 | 2 |
---|---|---|---|
1 The Secret of the Old Clock 💡 | 1959 | ||
2 The Hidden Staircase 💡 | 1959 | ||
3 The Bungalow Mystery 💡 | 1960 | ||
4 The Mystery at Lilac Inn 💡 | 1961 | ||
5 The Secret at Shadow Ranch 💡 | 1931 | ||
6 The Secret of Red Gate Farm 💡 | 1961 | ||
7 The Clue in the Diary 💡 | 1962 | ||
8 Nancy's Mysterious Letter 💡 | 1932 | ||
9 The Sign of the Twisted Candles 💡 | 1933 | ||
10 The Password to Larkspur Lane 💡 | 1933 | ||
11 The Clue of the Broken Locket 💡 | 1934 | ||
12 The Message in the Hollow Oak 💡 | 1935 | ||
13 The Mystery of the Ivory Charm 💡 | 1936 | ||
14 The Whispering Statue 💡 | 1937 | ||
15 The Haunted Bridge 💡 | 1937 | ||
16 The Clue of the Tapping Heels 💡 | 1939 | ||
17 The Mystery of the Brass - Bound Trunk 💡 | 1940 | ||
18 The Mystery at the Moss - Covered Mansion 💡 | 1941 | ||
19 The Quest of the Missing Map 💡 | 1942 | ||
20 The Clue in the Jewel Box 💡 | 1943 | ||
21 The Secret in the Old Attic 💡 | 1944 | ||
22 The Clue in the Crumbling Wall 💡 | 1945 | ||
23 The Mystery of the Tolling Bell 💡 | 1946 | ||
24 The Clue in the Old Album 💡 | 1947 | ||
25 The Ghost of Blackwood Hall 💡 | 1948 | ||
26 The Clue of the Leaning Chimney 💡 | 1949 | ||
27 The Secret of the Wooden lady 💡 | 1950 | ||
28 The Clue of the Black Keys 💡 | 1951 | ||
29 The Mystery at the Ski Jump | 1952 | ||
30 The Clue of the Velvet Mask | 1953 | ||
31 The Ringmaster's Secret | 1953 | ||
32 The Scarlet Slipper Mystery | 1954 | ||
? | 0 | 1 | 2 | 3 |
---|---|---|---|---|
∈ |
|
|
|
|
1 |
|
|||
2 |
|
|||
Martin Sheen 💡 |
Josiah "Jed" Bartlet
|
President
|
||
Bradley Whitford 💡 |
Josh Lyman
|
|||
Richard Schiff 💡 | Tobias Zachary "Toby" Ziegler | |||
John Spencer
|
Leo McGarry
|
|||
Dule Hill 💡 |
Charlie Young
|
|||
Stockard Channing
|
Abigail "Abbey" Bartlet | |||
Jimmy Smits 💡 |
Matt Santos [ eps 114 - ]
|
|||
? | 0 | 1 |
---|---|---|
∈ |
|
|
Snow White and the Seven Dwarfs
|
1937 | |
Pinocchio
|
1940 | |
Fantasia
|
1940 | |
The Reluctant Dragon
|
1941 | |
The Adventures of Ichabod and Mr . Toad
|
1949 | |
Cinderella
|
1950 | |
Alice in Wonderland
|
1951 | |
Peter Pan
|
1953 | |
20 , 000 Leagues Under the Sea
|
1954 | |
Sleeping Beauty
|
1959 | |
101 Dalmatians
|
1961 | |
Mary Poppins
|
1964 | |
The Aristocats
|
1970 | |
Bedknobs and Broomsticks
|
1971 | |
Robin Hood
|
1973 | |
The Rescuers
|
1977 | |
The Fox and the Hound
|
1981 | |
Who Framed Roger Rabbit
|
1988 | |
Oliver & Company
|
1988 | |
The Little Mermaid
|
1989 | |
Beauty and the Beast
|
1991 | |
The Nightmare Before Christmas
|
1993 | |
The Return of Jafar
|
1994 | |
A Goofy Movie
|
1995 | |
Pocahontas
|
1995 | |
Toy Story
|
1995 | |
James and the Giant Peach
|
1996 | |
The Hunchback of Notre Dame
|
1996 | |
Hercules
|
1997 | |
Mulan
|
1998 | |
A Bug's Life
|
1998 | |
Tarzan
|
1999 | |
Toy Story 2
|
1999 | |
Fantasia / 2000
|
2000 | |
The Tigger Movie
|
2000 | |
Recess : School's Out
|
2001 | |
Atlantis : The Lost Empire
|
2001 | |
Monsters , Inc .
|
2001 | |
Return to Never Land
|
2002 | |
Lilo & Stitch
|
2002 | |
The Jungle Book 2
|
2003 | |
Piglet's Big Movie
|
2003 | |
Finding Nemo
|
2003 | |
Teacher's Pet
|
2004 | |
The Lion King 3 : Hakuna Matata
|
2004 | |
Home on the Range
|
2004 | |
The Incredibles
|
2004 | |
Chicken Little
|
2005 | |
? | 0 | 1 | 2 | 3 |
---|---|---|---|---|
∈ |
|
|
||
0 |
|
|||
1 |
|
|||
Grant MacEwan College
|
Edmonton
|
|||
Lethbridge Community College 💡 |
Lethbridge
|
|||
Maskwachees Cultural College
|
Hobbema
|
|||
Medicine Hat College 💡 |
Medicine Hat
|
|||
Mount Royal College
|
Calgary
|
|||
Northern Alberta Institute of Technology
|
Edmonton
|
|||
? | 0 | 1 | 2 | 3 | 4 |
---|---|---|---|---|---|
1831 - 1865 |
Leopold I
|
||||
1865 - 1909 |
Leopold II
|
||||
1909 - 1934 |
Albert I
|
||||
1934 - 1951 |
Leopold III
|
||||
1951 - 1993 |
Baudouin I
|
||||
1993 |
Albert II
|
||||
[ ]: