Publications by Michael J. Cafarella
2017
Commun. ACM, 2017
The dark data extraction or knowledge base construction (KBC) problem is to populate a SQL database with information from unstructured data sources including emails, webpages, and PDF reports. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems. The key idea in DeepDive is that statistical inference and machine learning are key tools to attack classical data problems in extraction, cleaning, and integration in a unified and more effective manner. DeepDive programs are declarative in that one cannot write probabilistic inference algorithms; instead, one interacts by defining features or rules about the domain. A key reason for this design choice is to enable domain experts to build their own KBC systems. We present the applications, abstractions, and techniques of DeepDive employed to accelerate construction of KBC systems.
@article{zhang2017deepdive,
  author   = {Ce Zhang and Christopher R{\'e} and Michael J. Cafarella and Jaeho Shin and Feiran Wang and Sen Wu},
  title    = {DeepDive: declarative knowledge base construction},
  journal  = {Commun. ACM},
  year     = {2017},
  url      = {http://doi.acm.org/10.1145/3060586},
  abstract = {The dark data extraction or knowledge base construction (KBC) problem is to populate a SQL database with information from unstructured data sources including emails, webpages, and PDF reports. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems. The key idea in DeepDive is that statistical inference and machine learning are key tools to attack classical data problems in extraction, cleaning, and integration in a unified and more effective manner. DeepDive programs are declarative in that one cannot write probabilistic inference algorithms; instead, one interacts by defining features or rules about the domain. A key reason for this design choice is to enable domain experts to build their own KBC systems. We present the applications, abstractions, and techniques of DeepDive employed to accelerate construction of KBC systems.}
}
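The abstract above is explicit about DeepDive's programming model: the developer supplies features and rules about the domain, and the system, not the developer, runs statistical inference to decide which extractions enter the database. As a rough sketch of that division of labor, here is a small self-contained Python illustration; the toy corpus, all names, and the hand-set weights are invented here, and this is not DeepDive's actual API or its DDlog rule language.

# Illustrative sketch only: mimics the *style* of the programming model the
# abstract describes (developers write features/rules, not inference code).
# Everything here is hypothetical; it is not DeepDive's interface.
import re
import sqlite3
from math import exp

# Toy corpus standing in for "unstructured data sources".
SENTENCES = [
    "Barack Obama married Michelle Obama in 1992.",
    "Barack Obama met Joe Biden in 2007.",
]

def candidate_pairs(sentence):
    """Pair up capitalized two-word names appearing in one sentence."""
    names = re.findall(r"[A-Z][a-z]+ [A-Z][a-z]+", sentence)
    return [(a, b) for i, a in enumerate(names) for b in names[i + 1:]]

def features(sentence, p1, p2):
    """Developer-written feature rule: the words between the two mentions."""
    between = sentence.split(p1)[-1].split(p2)[0]
    return ["between:" + w for w in between.split()]

# Weights that learning/inference would normally produce; hand-set here.
WEIGHTS = {"between:married": 3.0, "between:met": -1.0}

def probability(feats):
    score = sum(WEIGHTS.get(f, 0.0) for f in feats)
    return 1.0 / (1.0 + exp(-score))

# Populate a SQL database with the high-confidence extractions.
db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE spouse (p1 TEXT, p2 TEXT, prob REAL)")
for s in SENTENCES:
    for p1, p2 in candidate_pairs(s):
        p = probability(features(s, p1, p2))
        if p > 0.9:
            db.execute("INSERT INTO spouse VALUES (?, ?, ?)", (p1, p2, p))
print(db.execute("SELECT * FROM spouse").fetchall())

In DeepDive itself the weights are learned rather than hand-set, which is precisely what lets a domain expert contribute feature rules without ever writing inference code.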
2016
Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 2016
DeepDive is a system for extracting relational databases from dark data: the mass of text, tables, and images that are widely collected and stored but which cannot be exploited by standard relational tools. If the information in dark data (scientific papers, Web classified ads, customer service notes, and so on) were instead in a relational database, it would give analysts a massive and valuable new set of "big data." DeepDive is distinctive when compared to previous information extraction systems in its ability to obtain very high precision and recall at reasonable engineering cost; in a number of applications, we have used DeepDive to create databases with accuracy that matches that of human annotators. To date we have successfully deployed DeepDive to create data-centric applications for insurance, materials science, genomics, paleontology, law enforcement, and others. The data unlocked by DeepDive represents a massive opportunity for industry, government, and scientific researchers. DeepDive is enabled by an unusual design that combines large-scale probabilistic inference with a novel developer interaction cycle; that design rests on several core innovations in probabilistic training and inference.
@inproceedings{zhang2016deepdive,
  author    = {Ce Zhang and Jaeho Shin and Christopher R{\'e} and Michael J. Cafarella and Feng Niu},
  title     = {Extracting Databases from Dark Data with DeepDive},
  booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016},
  address   = {San Francisco, CA, USA},
  year      = {2016},
  url       = {http://doi.acm.org/10.1145/2882903.2904442},
  abstract  = {DeepDive is a system for extracting relational databases from dark data: the mass of text, tables, and images that are widely collected and stored but which cannot be exploited by standard relational tools. If the information in dark data (scientific papers, Web classified ads, customer service notes, and so on) were instead in a relational database, it would give analysts a massive and valuable new set of "big data." DeepDive is distinctive when compared to previous information extraction systems in its ability to obtain very high precision and recall at reasonable engineering cost; in a number of applications, we have used DeepDive to create databases with accuracy that matches that of human annotators. To date we have successfully deployed DeepDive to create data-centric applications for insurance, materials science, genomics, paleontology, law enforcement, and others. The data unlocked by DeepDive represents a massive opportunity for industry, government, and scientific researchers. DeepDive is enabled by an unusual design that combines large-scale probabilistic inference with a novel developer interaction cycle; that design rests on several core innovations in probabilistic training and inference.}
}
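The "large-scale probabilistic inference" this abstract credits for DeepDive's quality is conventionally posed over a factor graph: candidate facts become boolean variables, and weighted factors encode evidence for each fact and correlations between facts. The toy Gibbs sampler below sketches that idea under heavy simplification; it is an invented illustration, not DeepDive's sampler, and the two "facts" and their weights are hypothetical.

# Toy Gibbs sampler over a tiny factor graph, sketching the kind of joint
# probabilistic inference the abstract refers to. From-scratch illustration
# under simplified assumptions; not DeepDive's engine.
import random
from math import exp

random.seed(0)

# Two boolean variables: candidate facts extracted from dark data.
assignment = {"fact_a": True, "fact_b": True}

# Factors: (variables, weight, predicate). A factor contributes its weight
# to the log-potential whenever its predicate holds on the assignment.
FACTORS = [
    (("fact_a",), 2.0, lambda a: a["fact_a"]),            # strong evidence for A
    (("fact_b",), 0.5, lambda a: a["fact_b"]),            # weak evidence for B
    (("fact_a", "fact_b"), -3.0,                          # A and B conflict
     lambda a: a["fact_a"] and a["fact_b"]),
]

def log_potential(a, var):
    """Sum of weights of satisfied factors touching `var`."""
    return sum(w for vs, w, pred in FACTORS if var in vs and pred(a))

def gibbs(n_sweeps=2000):
    counts = {v: 0 for v in assignment}
    for _ in range(n_sweeps):
        for var in assignment:
            # Conditional probability of var=True given all other variables.
            assignment[var] = True
            lp_true = log_potential(assignment, var)
            assignment[var] = False
            lp_false = log_potential(assignment, var)
            p_true = 1.0 / (1.0 + exp(lp_false - lp_true))
            assignment[var] = random.random() < p_true
            counts[var] += assignment[var]
    return {v: c / n_sweeps for v, c in counts.items()}

print(gibbs())  # estimated marginal probability that each candidate fact is true

Running it shows the conflict factor pulling the weakly supported fact well below its stand-alone probability: a simplified picture of how joint inference can clean up noisy extractions.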
2013
CIDR 2013, Sixth Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 2013
@inproceedings{anderson2013brainwash,
  author    = {Michael Anderson and Dolan Antenucci and Victor Bittorf and Matthew Burgess and Michael J. Cafarella and Arun Kumar and Feng Niu and Yongjoo Park and Christopher R{\'e} and Ce Zhang},
  title     = {Brainwash: A Data System for Feature Engineering},
  booktitle = {CIDR 2013, Sixth Biennial Conference on Innovative Data Systems Research},
  address   = {Asilomar, CA, USA},
  year      = {2013},
  url       = {http://www.cidrdb.org/cidr2013/Papers/CIDR13_Paper82.pdf}
}
2009
PVLDB, 2009
@article{abadi2009panel,
  author  = {Daniel J. Abadi and Michael J. Cafarella and Joseph M. Hellerstein and Donald Kossmann and Samuel Madden and Philip A. Bernstein},
  title   = {How Best to Build Web-Scale Data Managers? A Panel Discussion},
  journal = {PVLDB},
  year    = {2009},
  url     = {http://www.vldb.org/pvldb/2/vldb09-panel2.pdf}
}