T2D benchmark
[28]:
import logging as log

# Raise the root logger to INFO before takco is imported, so INFO records
# (such as the download / unpack / read messages) show up in the cell output.
log.getLogger().setLevel(log.INFO)

import takco

# Files handed to the dataset's ``download`` argument.
T2D_V1_DOWNLOADS = [
    'http://webdatacommons.org/webtables/tables_instance.tar.gz',
    'http://webdatacommons.org/webtables/entities_instance.tar.gz',
    'http://webdatacommons.org/webtables/classes_instance.csv',
    'http://webdatacommons.org/webtables/attributes_instance.tar.gz',
]

dataset = takco.evaluate.dataset.T2D(
    name='t2d-v1',
    version=1,
    resourcedir='resources',
    datadir='data',
    download=T2D_V1_DOWNLOADS,
)

# Disabled alternative, kept for reference: point the dataset at explicit
# local paths (TAIPAN T2DStar tables plus a t2d-v2 class file) instead of
# passing a ``download`` list.
# dataset = takco.evaluate.dataset.T2D(
#     resourcedir='resources',
#     datadir='data',
#     tabledir = 'data/taipan/TAIPAN-Datasets-master/T2DStar/tables',
#     # entitydir = 'data/t2d-v2/instance',
#     classfile = 'data/t2d-v2/classes_GS.csv',
#     # propdir = 'data/t2d-v2/property',
#     keycolfile = 'data/taipan/TAIPAN-Datasets-master/T2DStar/subject_column.csv',
#     propdir = 'data/taipan/TAIPAN-Datasets-master/T2DStar/properties_t2d_format',
# )

# Display the dataset's annotated tables, rendered the same way predictions
# are; the preview call stays last so its result is the cell's output.
annotated_tables = dataset.get_annotated_tables_as_predictions()
takco.preview(annotated_tables)
INFO:root:Downloading http://webdatacommons.org/webtables/tables_instance.tar.gz to data/t2d-v1/tables_instance.tar.gz
INFO:root:Unpacking data/t2d-v1/tables_instance.tar.gz to data/t2d-v1/tables_instance
INFO:root:Downloading http://webdatacommons.org/webtables/entities_instance.tar.gz to data/t2d-v1/entities_instance.tar.gz
INFO:root:Unpacking data/t2d-v1/entities_instance.tar.gz to data/t2d-v1/entities_instance
INFO:root:Downloading http://webdatacommons.org/webtables/classes_instance.csv to data/t2d-v1/classes_instance.csv
INFO:root:Downloading http://webdatacommons.org/webtables/attributes_instance.tar.gz to data/t2d-v1/attributes_instance.tar.gz
INFO:root:Unpacking data/t2d-v1/attributes_instance.tar.gz to data/t2d-v1/attributes_instance
INFO:root:Read 233 tables from data/t2d-v1/tables_instance
INFO:root:Read 233 key cols from data/t2d-v1/attributes_instance
INFO:root:Read 233 prop files from data/t2d-v1/attributes_instance
INFO:root:Read 233 class tables from data/t2d-v1/classes_instance.csv
INFO:root:Read 233 table headerrows from data/t2d-v1/entities_instance
INFO:root:Read 233 entity tables from data/t2d-v1/entities_instance
[28]:
| ? | 0 | 1 | 2 |
|---|---|---|---|
| ∈ |
|
||
| 0 |
|
||
| title | author | source | |
| adventures of huckleberry finn | mark twain | ala [11] | |
| the adventures of super diaper baby | dav pilkey | ala [47] | |
| the adventures of tom sawyer | mark twain | ala | |
| alice series | phyllis reynolds naylor | ala [2] | |
| all the king's men | robert penn warren | rad |
(146 more rows)
| ? | 0 | 1 | 2 |
|---|---|---|---|
| ∈ |
|
||
| # | media | mix | |
| 1 | dainik jagran | 27.500 | |
| 2 | dainik bhaskar | 14.000 | |
| 3 | aajtak tv | 7.000 | |
| 4 | cnn editions (international) | 6.000 | |
| 5 | dinakaran | 5.000 |
(16 more rows)
| ? | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| ∈ |
|
||||||
| 2 |
|
|
|
||||
| # | gebäude | gebäude | stadt | etagen | höhe | jahr | |
| 1 | NULL | burj khalifa | dubai | 163 | 2.717 ft | 2010 | |
| 2 | NULL | makkah clock royal tower [abraj al bait] | mekka | 95 | 1.972 ft | 2012 | |
| 3 | NULL | taipei 101 | taipei | 101 | 1.671 ft | 2004 | |
| 4 | NULL | shanghai world financial center | shanghai | 101 | 1.614 ft | 2008 | |
| 5 | NULL | international commerce centre [union square] | hong kong | 118 | 1.588 ft | 2010 |
(195 more rows)
| ? | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| ∈ |
|
||||
| 1 |
|
||||
| rank | company | industry | temkin experience rating (ter) | company ter vs industry ter | |
| 1 | sam's club | retailer | 85% | 13.0 | |
| 2 | publix | grocery chain | 81% | 4.9 | |
| 3 | a credit union | bank | 80% | 14.5 | |
| 3 | chick-fil-a | fast food chain | 80% | 6.2 | |
| 3 | subway | fast food chain | 80% | 6.4 |
(201 more rows)
| ? | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| ∈ |
|
||||||
| 0 |
|
||||||
| peak | ranking | map | guide | grid ref | alt (ft) | alt (m) | |
| allen crags | 43 | sw | e | ny 236 085 | 2572 | 784 | |
| angletarn pikes | 143 | ne | fe | ny 414 148 | 1857 | 566 | |
| ard crags | 142 | nw | nw | ny 207 197 | 1860 | 567 | |
| armboth fell | 182 | nw | c | ny 297 159 | 1570 | 479 | |
| arnison crag | 194 | ne | e | ny 394 150 | 1424 | 434 |
(210 more rows)
| ? | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| ∈ |
|
||||
| 1 |
|
|
|
|
|
| a | nom en anglais | endroit | capitale | heure | |
| afghanistan | afghanistan | asie | kabul | +4.5 | |
| afrique du sud | south afrique | afrique | pretoria | +2 | |
| albanie | albania | europe | tirane | +1 | |
| alderney (uk) voir îles (...) | alderney | europe | 0 | ||
| algérie | algeria | afrique | algiers | +1 |
(228 more rows)
| ? | 0 | 1 | 2 | 3 | 4 | 5 | 6 |
|---|---|---|---|---|---|---|---|
| ∈ |
|
||||||
| 1 |
|
|
|
||||
| NULL | country name: | population | area (sq. km.) | population density (sq. km.) | area (sq. mi.) | population density (sq. mi.) | |
| 36 | china | 1339190000 | 9596960.00 | 139.54 | 3705405.45 | 361.42 | |
| 77 | india | 1184639000 | 3287590.00 | 360.34 | 1269345.07 | 933.27 | |
| 183 | united states of america | 309975000 | 9629091.00 | 32.19 | 3717811.29 | 83.38 | |
| 78 | indonesia | 234181400 | 1919440.00 | 122.01 | 741099.62 | 315.99 | |
| 24 | brazil | 193364000 | 8511965.00 | 22.72 | 3286486.71 | 58.84 |
(188 more rows)
| ? | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| ∈ |
|
|||||
| 0 |
|
|
|
|||
| title | publisher | eu release date | au release date | pegi | acb | |
| donkey kong country | nintendo | 2006-12-08 | 2006-12-07 | 7 | g | |
| f-zero | nintendo | 2006-12-08 | 2006-12-07 | 3 | g | |
| simcity | nintendo | 2006-12-29 | 2006-12-29 | 3 | g | |
| super castlevania iv | konami | 2006-12-29 | 2006-12-29 | 3 | pg | |
| street fighter ii: the world warrior | capcom | 2007-01-19 | 2007-01-19 | 12 | pg |
(60 more rows)
| ? | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| ∈ |
|
||||
| 1 |
|
||||
| dial location | call letters | format | address | telephone | |
| am 790 | kabc (abc radio networks) | news/talk | 3321 s la cienega blvd los angeles 90016 | (310) 840-4900 | |
| am 900 | kali am | spanish news/talk | 747 e green st pasadena 91101 | (626) 844-8882 | |
| am 1300 | kazn(asian radio) | chinese variety | 747 e green st pasadena 91101 | (626) 568-1300 | |
| am 1580 | kbla | spanish news/talk | 123 figueroa st #101a los angeles 90012 | (213) 628-8700 | |
| am 740 | kbrt(k-bright) | religious talk | 3183-d airway ave costa mesa 92626 | (714) 754-4450 |
(25 more rows)
| ? | 0 | 1 | 2 |
|---|---|---|---|
| ∈ |
|
||
| local health boards | hospital name | link surgeons | |
| abertawe bro morgannwg university lhb | morriston hospital (swansea) | roger morgan | |
| singleton hospital (swansea) | roger morgan | ||
| princess of wales hospital (bridgend) | roger morgan | ||
| aneurin bevan lhb | neville hall hospital (abergavenny) | richard blackett | |
| royal gwent hospital (newport) | ahmed shandall |
(11 more rows)
[39]:
# Build the component named 'dbpedia_t2ksubset_es' from the baseline pipeline
# configuration file and show its repr.
searcher = takco.config.build(
    'dbpedia_t2ksubset_es',
    load='resources/pipelines/t2d-v2-baseline.toml',
)
print(searcher)

# Test: run a single query (the label 'USA' paired with an empty dict) while
# the searcher is open as a context manager, and print the URIs of each
# returned result set.
with searcher:
    queries = [('USA', {})]
    for candidates in searcher.search_entities(queries):
        print([candidate.uri for candidate in candidates])
ElasticSearcher(index='dbpediasub-7', baseuri='http://dbpedia.org/resource/', propbaseuri='http://dbpedia.org/ontology/', es_kwargs={'hosts': ['bricks07'], 'timeout': 60}, parts=True, prop_uri={'type': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'}, prop_baseuri={'type': 'http://dbpedia.org/ontology/'}, typer=SimpleTyper(use_dateparser='dateutil'), stringmatch='jaccard')
['http://dbpedia.org/resource/United_States']
[30]:
%%capture --no-display
from dask.distributed import Client
client = Client(address = 'tcp://192.168.62.207:8686')
client
[30]:
Client
|
Cluster
|
[43]:
%%time
baseline = takco.config.build('linker', load='resources/pipelines/t2d-v2-baseline.toml')
pred = takco.TableSet.link(
# takco.DaskHashBag(takco.TableSet.dataset(dataset), client=client),
takco.TableSet(list(takco.TableSet.dataset(dataset))[:10]),
usecols = "keycol",
linker = baseline,
)
pred.tables.persist()
report = pred.report(keycol_only=True, curve = True )
import pandas as pd
pd.DataFrame.from_dict( report.get('scores'), orient='index' )
CPU times: user 641 ms, sys: 33.6 ms, total: 675 ms
Wall time: 1.44 s
[43]:
| precision | recall | f1-score | support | predictions | |
|---|---|---|---|---|---|
| entities | 0.27907 | 0.433735 | 0.339623 | 166 | 258 |
[44]:
from takco.evaluate import pr_plot

# Plot the curves stored in the evaluation report (presumably
# precision/recall curves, going by the helper's name -- confirm).
curves = report['curves']
pr_plot(curves)
[44]:
[45]:
# Inspect the linked tables: at most 25 tables, with rows flagged as correct
# hidden. ``nrows=None`` presumably lifts the per-table row limit -- confirm.
takco.preview(
    pred,
    nrows=None,
    ntables=25,
    hide_correct_rows=True,
)
[45]:
| ? | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| ∈ |
|
|||
| Brian Mulroney 💡 | ||||
|
Joe Clark
|
||||
| John George Diefenbaker 💡 | ||||
|
John Sparrow David Thompson
|
||||
| Lester Bowles Pearson 💡 | ||||
| Louis Stephen St . Laurent 💡 | ||||
|
Paul Martin
|
||||
| Richard Bedford Bennett 💡 | ||||
| ? | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| ∈ |
|
|
|||
| 0 |
|
||||
| 1 |
|
||||
|
Ontario College of Art & Design
|
Toronto
|
||||
| ? | 0 | 1 | 2 |
|---|---|---|---|
| ∈ |
|
|
|
| 0 |
|
||
| 1 |
|
||
|
Greater Yellowlegs
|
Tringa melanoleuca | ||
|
Lesser Yellowlegs
|
Tringa flavipes | ||
|
Willet
|
Catoptrophorus semipalmatus | ||
|
Wandering Tattler
|
Heteroscelus incanus | ||
|
Spotted Sandpiper
|
Actitis macularia
|
||
|
Upland Sandpiper
|
Bartramia longicauda | ||
|
Ruddy Turnstone
|
Arenaria interpres | ||
|
Black Turnstone
|
Arenaria melanocephala
|
||
|
Surfbird
|
Aphriza virgata | ||
|
Sanderling
|
Calidris alba | ||
|
Red - necked Stint
|
Calidris ruficollis | ||
|
Little Stint
|
Calidris minuta | ||
|
Least Sandpiper
|
Calidris minutilla
|
||
|
Baird's Sandpiper
|
Calidris bairdii | ||
|
Pectoral Sandpiper
|
Calidris melanotos | ||
|
Sharp - tailed Sandpiper
|
Calidris acuminata
|
||
|
Dunlin
|
Calidris alpina | ||
|
Curlew Sandpiper
|
Calidris ferruginea | ||
| Spoonbilleded Sandpiper | Eurynorhynchus pygmaeus | ||
|
Buff - breasted Sandpiper
|
Tryngites subruficollis
|
||
|
Ruff
|
Philomachus pugnax | ||
|
Long - billed Dowitcher
|
Limnodromus scolopaceus
|
||
|
Wilson's Snipe
|
Gallinago delicata
|
||
|
American Woodcock
|
Scolopax minor
|
||
|
Wilson's Phalarope
|
Phalaropus tricolor
|
||
|
Red - necked Phalarope
|
Phalaropus lobatus
|
||
|
Red Phalarope
|
Phalaropus fulicarius | ||
| ? | 0 | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|---|
| ∈ |
|
|||||
| 3 |
|
|||||
| Lisa's Rival 💡 |
Mike Scully
|
|||||
| Bart of Darkness 💡 | Dan McGrath | |||||
| Another Simpsons Clip Show 💡 | Penny Wise | |||||
| Itchy & Scratchy Land 💡 |
John Swartzwelder
|
|||||
| Sideshow Bob Roberts 💡 | Bill Oakley & Josh Weinstein | |||||
| Nightmare Cafeteria 💡 |
David S . Cohen
|
|||||
| Bart's Girlfriend |
John Collier
|
|||||
| Lisa on Ice 💡 |
Mike Scully
|
|||||
|
Homer : Bad Man
|
Greg Daniels
|
|||||
| Grampa vs . Sexual Inadequacy 💡 | Bill Oakley & Josh Weinstein | |||||
|
Fear of Flying
|
David Sacks | |||||
| Homer the Great 💡 |
John Swartzwelder
|
|||||
| A Star is Burns 💡 |
Ken Keeler
|
|||||
| And Maggie Makes Three 💡 | Jennifer Crittenden | |||||
| Bart's Comet |
John Swartzwelder
|
|||||
| Homie the Clown 💡 |
John Swartzwelder
|
|||||
| Bart vs . Australia 💡 | Bill Oakley & Josh Weinstein | |||||
| Homer vs . Patty & Selma 💡 | Brent Forrester | |||||
| Lisa's Wedding |
Greg Daniels
|
|||||
| Two Dozen and One Greyhounds |
Mike Scully
|
|||||
| The PTA Disbands 💡 | Jennifer Crittenden | |||||
| 'Round Springfield 💡 | Joshua Sternin & Jeff Ventimilia | |||||
| The Springfield Connection 💡 |
John Collier
|
|||||
| Lemon of Troy 💡 | Brent Forrester | |||||
|
Who Shot Mr . Burns ? ( Part One )
|
Bill Oakley & Josh Weinstein | |||||
| ? | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| ∈ |
|
|
||
| 0 |
|
|||
| Venta silurum 💡 |
Caerwent
|
|||
| Moridunum |
Carmarthen
|
|||
| Camulodonum 💡 |
Colchester
|
|||
|
Noviomagus
|
Chichester
|
|||
|
Deva
|
Chester
|
|||
| Durnovaria 💡 |
Dorchester
|
|||
| Glevum |
Gloucester
|
|||
| Ratae Coritanorum 💡 |
Leicester
|
|||
|
Mamucium
|
Manchester
|
|||
| Verulamium 💡 |
St Albans
|
|||
| Venta Belgarum 💡 |
Winchester
|
|||
|
Eboracum
|
York
|
|||
| ? | 0 | 1 | 2 |
|---|---|---|---|
| 1 The Secret of the Old Clock 💡 | 1959 | ||
| 2 The Hidden Staircase 💡 | 1959 | ||
| 3 The Bungalow Mystery 💡 | 1960 | ||
| 4 The Mystery at Lilac Inn 💡 | 1961 | ||
| 5 The Secret at Shadow Ranch 💡 | 1931 | ||
| 6 The Secret of Red Gate Farm 💡 | 1961 | ||
| 7 The Clue in the Diary 💡 | 1962 | ||
| 8 Nancy's Mysterious Letter 💡 | 1932 | ||
| 9 The Sign of the Twisted Candles 💡 | 1933 | ||
| 10 The Password to Larkspur Lane 💡 | 1933 | ||
| 11 The Clue of the Broken Locket 💡 | 1934 | ||
| 12 The Message in the Hollow Oak 💡 | 1935 | ||
| 13 The Mystery of the Ivory Charm 💡 | 1936 | ||
| 14 The Whispering Statue 💡 | 1937 | ||
| 15 The Haunted Bridge 💡 | 1937 | ||
| 16 The Clue of the Tapping Heels 💡 | 1939 | ||
| 17 The Mystery of the Brass - Bound Trunk 💡 | 1940 | ||
| 18 The Mystery at the Moss - Covered Mansion 💡 | 1941 | ||
| 19 The Quest of the Missing Map 💡 | 1942 | ||
| 20 The Clue in the Jewel Box 💡 | 1943 | ||
| 21 The Secret in the Old Attic 💡 | 1944 | ||
| 22 The Clue in the Crumbling Wall 💡 | 1945 | ||
| 23 The Mystery of the Tolling Bell 💡 | 1946 | ||
| 24 The Clue in the Old Album 💡 | 1947 | ||
| 25 The Ghost of Blackwood Hall 💡 | 1948 | ||
| 26 The Clue of the Leaning Chimney 💡 | 1949 | ||
| 27 The Secret of the Wooden lady 💡 | 1950 | ||
| 28 The Clue of the Black Keys 💡 | 1951 | ||
| 29 The Mystery at the Ski Jump | 1952 | ||
| 30 The Clue of the Velvet Mask | 1953 | ||
| 31 The Ringmaster's Secret | 1953 | ||
| 32 The Scarlet Slipper Mystery | 1954 | ||
| ? | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| ∈ |
|
|
|
|
| 1 |
|
|||
| 2 |
|
|||
| Martin Sheen 💡 |
Josiah "Jed" Bartlet
|
President
|
||
| Bradley Whitford 💡 |
Josh Lyman
|
|||
| Richard Schiff 💡 | Tobias Zachary "Toby" Ziegler | |||
|
John Spencer
|
Leo McGarry
|
|||
| Dule Hill 💡 |
Charlie Young
|
|||
|
Stockard Channing
|
Abigail "Abbey" Bartlet | |||
| Jimmy Smits 💡 |
Matt Santos [ eps 114 - ]
|
|||
| ? | 0 | 1 |
|---|---|---|
| ∈ |
|
|
|
Snow White and the Seven Dwarfs
|
1937 | |
|
Pinocchio
|
1940 | |
|
Fantasia
|
1940 | |
|
The Reluctant Dragon
|
1941 | |
|
The Adventures of Ichabod and Mr . Toad
|
1949 | |
|
Cinderella
|
1950 | |
|
Alice in Wonderland
|
1951 | |
|
Peter Pan
|
1953 | |
|
20 , 000 Leagues Under the Sea
|
1954 | |
|
Sleeping Beauty
|
1959 | |
|
101 Dalmatians
|
1961 | |
|
Mary Poppins
|
1964 | |
|
The Aristocats
|
1970 | |
|
Bedknobs and Broomsticks
|
1971 | |
|
Robin Hood
|
1973 | |
|
The Rescuers
|
1977 | |
|
The Fox and the Hound
|
1981 | |
|
Who Framed Roger Rabbit
|
1988 | |
|
Oliver & Company
|
1988 | |
|
The Little Mermaid
|
1989 | |
|
Beauty and the Beast
|
1991 | |
|
The Nightmare Before Christmas
|
1993 | |
|
The Return of Jafar
|
1994 | |
|
A Goofy Movie
|
1995 | |
|
Pocahontas
|
1995 | |
|
Toy Story
|
1995 | |
|
James and the Giant Peach
|
1996 | |
|
The Hunchback of Notre Dame
|
1996 | |
|
Hercules
|
1997 | |
|
Mulan
|
1998 | |
|
A Bug's Life
|
1998 | |
|
Tarzan
|
1999 | |
|
Toy Story 2
|
1999 | |
|
Fantasia / 2000
|
2000 | |
|
The Tigger Movie
|
2000 | |
|
Recess : School's Out
|
2001 | |
|
Atlantis : The Lost Empire
|
2001 | |
|
Monsters , Inc .
|
2001 | |
|
Return to Never Land
|
2002 | |
|
Lilo & Stitch
|
2002 | |
|
The Jungle Book 2
|
2003 | |
|
Piglet's Big Movie
|
2003 | |
|
Finding Nemo
|
2003 | |
|
Teacher's Pet
|
2004 | |
|
The Lion King 3 : Hakuna Matata
|
2004 | |
|
Home on the Range
|
2004 | |
|
The Incredibles
|
2004 | |
|
Chicken Little
|
2005 | |
| ? | 0 | 1 | 2 | 3 |
|---|---|---|---|---|
| ∈ |
|
|
||
| 0 |
|
|||
| 1 |
|
|||
|
Grant MacEwan College
|
Edmonton
|
|||
| Lethbridge Community College 💡 |
Lethbridge
|
|||
|
Maskwachees Cultural College
|
Hobbema
|
|||
| Medicine Hat College 💡 |
Medicine Hat
|
|||
|
Mount Royal College
|
Calgary
|
|||
|
Northern Alberta Institute of Technology
|
Edmonton
|
|||
| ? | 0 | 1 | 2 | 3 | 4 |
|---|---|---|---|---|---|
| 1831 - 1865 |
Leopold I
|
||||
| 1865 - 1909 |
Leopold II
|
||||
| 1909 - 1934 |
Albert I
|
||||
| 1934 - 1951 |
Leopold III
|
||||
| 1951 - 1993 |
Baudouin I
|
||||
| 1993 |
Albert II
|
||||
[ ]: