Publications by Ce Zhang
2019
Proceedings of the VLDB 2019, Los Angeles, CA, USA, August 2019
Learning from the data stored in a database is an important function increasingly available in relational engines. Methods using lower-precision input data are of special interest given their overall higher efficiency. However, in databases, these methods have a hidden cost: the quantization of the real value into a smaller number is an expensive step. To address this issue, we present MLWeaving, a data structure and hardware acceleration technique intended to speed up learning of generalized linear models over low-precision data. MLWeaving provides a compact in-memory representation that enables the retrieval of data at any level of precision. MLWeaving also provides a highly efficient implementation of stochastic gradient descent on FPGAs and enables the dynamic tuning of precision, instead of using a fixed precision level during learning. Experimental results show that MLWeaving converges up to 16× faster than low-precision implementations of first-order methods on CPUs.
@inproceedings{abc, abstract = {Learning from the data stored in a database is an important function increasingly available in relational engines. Methods using lower precision input data are of special interest given their overall higher efficiency. However, in databases, these methods have a hidden cost: the quantization of the real value into a smaller number is an expensive step. To address this issue, we present MLWeaving, a data structure and hardware acceleration technique intended to speed up learning of generalized linear models over low precision data. MLWeaving provides a compact in-memory representation that enables the retrieval of data at any level of precision. MLWeaving also provides a highly efficient implementation of stochastic gradient descent on FPGAs and enables the dynamic tuning of precision, instead of using a fixed precision level during learning. Experimental results show that MLWeaving converges up to 16{\texttimes} faster than low-precision implementations of first-order methods on CPUs.}, author = {Zeke Wang and Kaan Kara and Gustavo Alonso and Onur Mutlu and Ce Zhang}, booktitle = {Proceedings of the VLDB 2019}, title = {Accelerating Generalized Linear Models with MLWeaving: A One-Size-Fits-All System for Any-precision Learning}, url = {https://dl.acm.org/doi/10.14778/3317315.3317322}, venue = {Los Angeles, CA, USA}, year = {2019} }
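MLWeaving's any-precision retrieval rests on a bit-plane ("weaved") layout: values are quantized to fixed point once, and the b-th most significant bits of all values are stored together, so reading only the first k planes yields a k-bit approximation with proportionally less memory traffic. A minimal Python sketch of that layout idea, assuming values in [0, 1); the paper's actual in-memory format and FPGA datapath are not reproduced here:

import numpy as np

def weave(X, bits=8):
    # Quantize values in [0, 1) to `bits`-bit fixed point, then store one
    # bit-plane at a time, most significant bit first (illustrative layout).
    Q = np.floor(X * (1 << bits)).astype(np.uint32)
    return [((Q >> (bits - 1 - b)) & 1).astype(np.uint8) for b in range(bits)]

def read(planes, precision):
    # Reconstruct an approximation from the first `precision` planes only;
    # dynamic precision tuning amounts to changing this argument.
    bits = len(planes)
    Q = sum(p.astype(np.uint32) << (bits - 1 - b)
            for b, p in enumerate(planes[:precision]))
    return Q / float(1 << bits)

X = np.random.rand(4, 3)
for k in (2, 4, 8):
    print(k, np.abs(read(weave(X), k) - X).max())  # error shrinks like 2**-k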
Proceedings of the VLDB 2019, Los Angeles, CA, USA, August 2019
The ability to perform machine learning (ML) tasks in a database management system (DBMS) provides the data analyst with a powerful tool. Unfortunately, integrating ML into a DBMS is challenging for reasons ranging from differences in execution model to data layout requirements. In this paper, we take a column-store main-memory DBMS, optimized for online analytical processing, as our initial system. On this system, we explore the integration of coordinate-descent-based methods working natively on columnar format to train generalized linear models. We use a cache-efficient, partitioned stochastic coordinate descent algorithm that provides linear throughput scalability with the number of cores (up to 14 cores in our experiments) while preserving convergence quality. Existing column-oriented DBMSs rely on compression and even encryption to store data in memory. When those features are considered, the performance of a CPU-based solution suffers. Thus, in the paper we also show how to exploit hardware acceleration as part of a hybrid CPU+FPGA system to provide on-the-fly data transformation combined with an FPGA-based coordinate-descent engine. The resulting system is a column-store DBMS that preserves its important features (e.g., data compression) while offering high-performance machine learning capabilities.
@inproceedings{abc, abstract = {The ability to perform machine learning (ML) tasks in a database management system (DBMS) provides the data analyst with a powerful tool. Unfortunately, integration of ML into a DBMS is challenging for reasons varying from differences in execution model to data layout requirements. In this paper, we assume a column-store main-memory DBMS, optimized for online analytical processing, as our initial system. On this system, we explore the integration of coordinate-descent based methods working natively on columnar format to train generalized linear models. We use a cache-efficient, partitioned stochastic coordinate descent algorithm providing linear throughput scalability with the number of cores while preserving convergence quality, up to 14 cores in our experiments. Existing column oriented DBMS rely on compression and even encryption to store data in memory. When those features are considered, the performance of a CPU based solution suffers. Thus, in the paper we also show how to exploit hardware acceleration as part of a hybrid CPU+FPGA system to provide on-the-fly data transformation combined with an FPGA-based coordinate-descent engine. The resulting system is a column-store DBMS with its important features preserved (e.g., data compression) that offers high performance machine learning capabilities.}, author = {Kaan Kara and Ken Eguro and Ce Zhang and Gustavo Alonso}, booktitle = {Proceedings of the VLDB 2019}, title = { ColumnML: Column Store Machine Learning with On-the-Fly Data Transformation}, venue = {Los Angeles, CA, USA}, year = {2019} }
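Coordinate descent pairs naturally with a column store because each update reads exactly one attribute vector. A minimal sketch of plain stochastic coordinate descent for least squares over per-column arrays; the paper's partitioned, cache-conscious variant and the FPGA engine for compressed or encrypted columns are not modeled here:

import numpy as np

def scd_least_squares(cols, y, epochs=20):
    # Minimize 0.5 * ||y - Xw||^2 one coordinate at a time, touching a single
    # column per update -- the access pattern a column store provides natively.
    w = np.zeros(len(cols))
    r = y.astype(float).copy()         # residual y - Xw, maintained incrementally
    for _ in range(epochs):
        for j in np.random.permutation(len(cols)):
            x = cols[j]
            step = (x @ r) / (x @ x)   # exact minimizer along coordinate j
            w[j] += step
            r -= step * x              # O(n) residual update keeps steps cheap
    return w

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = X @ np.array([1.0, -2.0, 0.5, 3.0, 0.0]) + 0.01 * rng.normal(size=200)
print(scd_least_squares(list(X.T), y).round(2))  # close to [1, -2, 0.5, 3, 0]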
Proceedings of the NAACL-HLT 2019, Minneapolis, USA, June 2019
Previous research shows that eye-tracking data contains information about the lexical and syntactic properties of text, which can be used to improve natural language processing models. In this work, we leverage eye movement features from three corpora with recorded gaze information to augment a state-of-the-art neural model for named entity recognition (NER) with gaze embeddings. These corpora were manually annotated with named entity labels. Moreover, we show how gaze features, generalized at the word-type level, eliminate the need for recorded eye-tracking data at test time. The gaze-augmented models for NER using token-level and type-level features outperform the baselines. We present the benefits of eye-tracking features by evaluating the NER models both on individual datasets and in cross-domain settings.
@inproceedings{abc, abstract = {Previous research shows that eye-tracking data contains information about the lexical and syntactic properties of text, which can be used to improve natural language processing models. In this work, we leverage eye movement features from three corpora with recorded gaze information to augment a state-of-the-art neural model for named entity recognition (NER) with gaze embeddings. These corpora were manually annotated with named entity labels. Moreover, we show how gaze features, generalized at the word-type level, eliminate the need for recorded eye-tracking data at test time. The gaze-augmented models for NER using token-level and type-level features outperform the baselines. We present the benefits of eye-tracking features by evaluating the NER models both on individual datasets and in cross-domain settings.}, author = {Nora Hollenstein and Ce Zhang}, booktitle = {Proceedings of the NAACL-HLT 2019}, title = {Entity Recognition at First Sight: Improving NER with Eye Movement Information}, venue = {Minneapolis, USA}, year = {2019} }
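The type-level generalization works by averaging recorded gaze measurements per word type over the training corpora, so that at test time a plain lexicon lookup supplies the gaze features and no eye-tracker is needed. A toy sketch of that aggregation; the two feature dimensions shown are hypothetical stand-ins for the paper's gaze features:

from collections import defaultdict

import numpy as np

def build_gaze_lexicon(tokens, gaze):
    # Average gaze feature vectors per (lowercased) word type.
    sums = defaultdict(lambda: np.zeros(len(gaze[0])))
    counts = defaultdict(int)
    for tok, g in zip(tokens, gaze):
        sums[tok.lower()] += np.asarray(g, dtype=float)
        counts[tok.lower()] += 1
    return {t: sums[t] / counts[t] for t in sums}

# Hypothetical features: [total fixation duration (ms), number of fixations].
lexicon = build_gaze_lexicon(
    ["Obama", "visited", "Zurich", "Obama"],
    [[220, 2], [150, 1], [310, 3], [260, 2]],
)
print(lexicon["obama"])  # [240., 2.] -- usable at test time without recordings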
2017
Proceedings of the 27th International Conference on Field Programmable Logic and Applications (FPL), Ghent, Belgium, October 2017
Decision tree ensembles are commonly used in a wide range of applications and becoming the de facto algorithm for decision tree based classifiers. Different trees in an ensemble can be processed in parallel during tree inference, making them a suitable use case for FPGAs. Large tree ensembles, however, require careful mapping of trees to on-chip memory and management of memory accesses. As a result, existing FPGA solutions suffer from the inability to scale beyond tens of trees and lack the flexibility to support different tree ensembles. In this paper we present an FPGA tree ensemble classifier together with a software driver to efficiently manage the FPGA's memory resources. The classifier architecture efficiently utilizes the FPGA's resources to fit half a million tree nodes in on-chip memory, delivering up to 20× speedup over a 10-threaded CPU implementation when fully processing the tree ensemble on the FPGA. It can also combine the CPU and FPGA to scale to tree ensembles that do not fit in on-chip memory, achieving up to an order of magnitude speedup compared to a pure CPU implementation. In addition, the classifier architecture can be programmed at runtime to process varying tree ensemble sizes.
@inproceedings{abc, abstract = {Decision tree ensembles are commonly used in a wide range of applications and becoming the de facto algorithm for decision tree based classifiers. Different trees in an ensemble can be processed in parallel during tree inference, making them a suitable use case for FPGAs. Large tree ensembles, however, require careful mapping of trees to on-chip memory and management of memory accesses. As a result, existing FPGA solutions suffer from the inability to scale beyond tens of trees and lack the flexibility to support different tree ensembles. In this paper we present an FPGA tree ensemble classifier together with a software driver to efficiently manage the FPGA{\textquoteright}s memory resources. The classifier architecture efficiently utilizes the FPGA{\textquoteright}s resources to fit half a million tree nodes in on-chip memory, delivering up to 20{\texttimes} speedup over a 10-threaded CPU implementation when fully processing the tree ensemble on the FPGA. It can also combine the CPU and FPGA to scale to tree ensembles that do not fit in on-chip memory, achieving up to an order of magnitude speedup compared to a pure CPU implementation. In addition, the classifier architecture can be programmed at runtime to process varying tree ensemble sizes.}, author = {Muhsen Owaida and Hantian Zhang and Ce Zhang and Gustavo Alonso}, booktitle = {Proceedings of the 27th International Conference on Field Programmable Logic and Applications (FPL)}, title = {Scalable inference of decision tree ensembles: Flexible design for CPU-FPGA platforms}, venue = {Ghent, Belgium}, year = {2017} }
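The parallelism that the FPGA classifier exploits comes from the fact that trees in an ensemble are evaluated independently, with only the final aggregation being sequential. A small sketch of inference over a flat node-array tree encoding; the array layout and the thread pool are illustrative assumptions, not the paper's on-chip memory mapping:

import numpy as np
from concurrent.futures import ThreadPoolExecutor

def tree_predict(tree, x):
    # Walk one tree stored as flat arrays: internal node i tests
    # x[feat[i]] <= thresh[i]; left[i] == -1 marks a leaf.
    feat, thresh, left, right, value = tree
    i = 0
    while left[i] != -1:
        i = left[i] if x[feat[i]] <= thresh[i] else right[i]
    return value[i]

def ensemble_predict(trees, x):
    # Every tree is independent, so all of them can be evaluated in parallel.
    with ThreadPoolExecutor() as pool:
        return sum(pool.map(lambda t: tree_predict(t, x), trees)) / len(trees)

# A one-split stump duplicated 8 times, just to exercise the code path.
stump = (np.array([0, 0, 0]), np.array([0.5, 0.0, 0.0]),
         np.array([1, -1, -1]), np.array([2, -1, -1]),
         np.array([0.0, -1.0, 1.0]))
print(ensemble_predict([stump] * 8, np.array([0.7])))  # -> 1.0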
Proceedings of the 34th International Conference on Machine Learning, ICML 2017, Sydney, NSW, Australia, August 2017
@inproceedings{abc, author = {Hantian Zhang and Jerry Li and Kaan Kara and Dan Alistarh and Ji Liu and Ce Zhang}, booktitle = {Proceedings of the 34th International Conference on Machine Learning, ICML 2017, Sydney, NSW, Australia}, title = {ZipML: Training Linear Models with End-to-End Low Precision, and a Little Bit of Deep Learning.}, url = {http://proceedings.mlr.press/v70/zhang17e.html}, year = {2017} }
Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA, May 2017
@inproceedings{abc, author = {Jiawei Jiang and Bin Cui and Ce Zhang and Lele Yu}, booktitle = {Proceedings of the 2017 ACM International Conference on Management of Data, SIGMOD Conference 2017, Chicago, IL, USA}, title = {Heterogeneity-aware Distributed Parameter Servers.}, url = {http://doi.acm.org/10.1145/3035918.3035933}, year = {2017} }
Proceedings of the 2nd Workshop on Human-In-the-Loop Data Analytics, HILDA@SIGMOD 2017, Chicago, IL, USA, May 2017
@inproceedings{abc, author = {Ce Zhang and Wentao Wu and Tian Li}, booktitle = {Proceedings of the 2nd Workshop on Human-In-the-Loop Data Analytics, HILDA@SIGMOD 2017, Chicago, IL, USA}, title = {An Overreaction to the Broken Machine Learning Abstraction: The ease.ml Vision.}, url = {http://doi.acm.org/10.1145/3077257.3077265}, year = {2017} }
33rd IEEE International Conference on Data Engineering, ICDE 2017, San Diego, CA, USA, April 2017
@inproceedings{abc, author = {Jie Jiang and Jiawei Jiang and Bin Cui and Ce Zhang}, booktitle = {33rd IEEE International Conference on Data Engineering, ICDE 2017, San Diego, CA, USA}, title = {TencentBoost: A Gradient Boosting Tree System with Parameter Server.}, url = {https://doi.org/10.1109/ICDE.2017.87}, year = {2017} }
25th IEEE Annual International Symposium on Field-Programmable Custom Computing Machines, FCCM 2017, Napa, CA, USA, April 2017
Stochastic gradient descent (SGD) is a commonly used algorithm for training linear machine learning models. Based on vector algebra, it benefits from the inherent parallelism available in an FPGA. In this paper, we first present a single-precision floating-point SGD implementation on an FPGA that provides performance similar to that of a 10-core CPU. We then adapt the design to make it capable of processing low-precision data. The low-precision data is obtained from a novel compression scheme, called stochastic quantization, specifically designed for machine learning applications. We test both full-precision and low-precision designs on various regression and classification data sets. We achieve up to an order of magnitude training speedup when using low-precision data compared to a full-precision SGD on the same FPGA and a state-of-the-art multi-core solution, while maintaining the quality of training. We open source the designs presented in this paper.
@inproceedings{abc, abstract = {Stochastic gradient descent (SGD) is a commonly used algorithm for training linear machine learning models. Based on vector algebra, it benefits from the inherent parallelism available in an FPGA. In this paper, we first present a single-precision floating-point SGD implementation on an FPGA that provides performance similar to that of a 10-core CPU. We then adapt the design to make it capable of processing low-precision data. The low-precision data is obtained from a novel compression scheme, called stochastic quantization, specifically designed for machine learning applications. We test both full-precision and low-precision designs on various regression and classification data sets. We achieve up to an order of magnitude training speedup when using low-precision data compared to a full-precision SGD on the same FPGA and a state-of-the-art multi-core solution, while maintaining the quality of training. We open source the designs presented in this paper.}, author = {Kaan Kara and Dan Alistarh and Gustavo Alonso and Onur Mutlu and Ce Zhang}, booktitle = {25th IEEE Annual International Symposium on Field-Programmable Custom Computing Machines, FCCM 2017, Napa, CA, USA}, title = {FPGA-Accelerated Dense Linear Machine Learning: A Precision-Convergence Trade-Off.}, url = {https://doi.org/10.1109/FCCM.2017.39}, venue = {Napa, CA, USA}, year = {2017} }
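Stochastic quantization keeps SGD convergent on low-precision data because each quantized value is an unbiased estimate of the original. A minimal Python sketch of randomized rounding onto a uniform grid, with the [-1, 1] range and bit width chosen for illustration; this shows the software idea only, not the FPGA datapath:

import numpy as np

def stochastic_round(x, bits, lo=-1.0, hi=1.0):
    # Round to a uniform grid with 2**bits points on [lo, hi], rounding up
    # with probability equal to the fractional position, so E[q] = x.
    levels = (1 << bits) - 1
    t = (np.clip(x, lo, hi) - lo) / (hi - lo) * levels
    low = np.floor(t)
    q = low + (np.random.rand(*np.shape(x)) < (t - low))
    return q / levels * (hi - lo) + lo

x = np.random.uniform(-1, 1, 100_000)
q = stochastic_round(x, bits=2)
print(np.unique(q))              # only the 4 representable grid values
print(abs(q.mean() - x.mean()))  # close to 0: unbiased on average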
PVLDB, January 2017
@inproceedings{abc, author = {Zhipeng Zhang and Yingxia Shao and Bin Cui and Ce Zhang}, booktitle = {PVLDB}, title = {An Experimental Evaluation of SimRank-based Similarity Search Algorithms.}, url = {http://www.vldb.org/pvldb/vol10/p601-zhang.pdf}, year = {2017} }
VLDB J., January 2017
Populating a database with unstructured information is a long-standing problem in industry and research that encompasses problems of extraction, cleaning, and integration. Recent names used for this problem include dealing with dark data and knowledge base construction (KBC). In this work, we describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems, and we present techniques to make the KBC process more efficient. We observe that the KBC process is iterative, and we develop techniques to incrementally produce inference results for KBC systems. We propose two methods for incremental inference, based respectively on sampling and variational techniques. We also study the tradeoff space of these methods and develop a simple rule-based optimizer. DeepDive includes all of these contributions, and we evaluate DeepDive on five KBC systems, showing that it can speed up KBC inference tasks by up to two orders of magnitude with negligible impact on quality.
@article{abc, abstract = {Populating a database with unstructured information is a long-standing problem in industry and research that encompasses problems of extraction, cleaning, and integration. Recent names used for this problem include dealing with dark data and knowledge base construction (KBC). In this work, we describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems, and we present techniques to make the KBC process more efficient. We observe that the KBC process is iterative, and we develop techniques to incrementally produce inference results for KBC systems. We propose two methods for incremental inference, based respectively on sampling and variational techniques. We also study the tradeoff space of these methods and develop a simple rule-based optimizer. DeepDive includes all of these contributions, and we evaluate DeepDive on five KBC systems, showing that it can speed up KBC inference tasks by up to two orders of magnitude with negligible impact on quality.}, author = {Christopher De Sa and Alexander Ratner and Christopher R{\'e} and Jaeho Shin and Feiran Wang and Sen Wu and Ce Zhang}, journal = {VLDB J.}, title = {Incremental knowledge base construction using DeepDive.}, url = {http://dx.doi.org/10.1007/s00778-016-0437-2}, year = {2017} }
CoRR, January 2017
@article{abc, author = {Kevin Schawinski and Ce Zhang and Hantian Zhang and Lucas Fowler and Gokula Krishnan Santhanam}, journal = {CoRR}, title = {Generative Adversarial Networks recover features in astrophysical images of galaxies beyond the deconvolution limit.}, url = {http://arxiv.org/abs/1702.00403}, year = {2017} }
Commun. ACM, January 2017
The dark data extraction or knowledge base construction (KBC) problem is to populate a SQL database with information from unstructured data sources including emails, webpages, and pdf reports. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems. The key idea in DeepDive is that statistical inference and machine learning are key tools to attack classical data problems in extraction, cleaning, and integration in a unified and more effective manner. DeepDive programs are declarative in that one cannot write probabilistic inference algorithms; instead, one interacts by defining features or rules about the domain. A key reason for this design choice is to enable domain experts to build their own KBC systems. We present the applications, abstractions, and techniques of DeepDive employed to accelerate construction of KBC systems.
@inproceedings{abc, abstract = {The dark data extraction or knowledge base construction (KBC) problem is to populate a SQL database with information from unstructured data sources including emails, webpages, and pdf reports. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems. The key idea in DeepDive is that statistical inference and machine learning are key tools to attack classical data problems in extraction, cleaning, and integration in a unified and more effective manner. DeepDive programs are declarative in that one cannot write probabilistic inference algorithms; instead, one interacts by defining features or rules about the domain. A key reason for this design choice is to enable domain experts to build their own KBC systems. We present the applications, abstractions, and techniques of DeepDive employed to accelerate construction of KBC systems.}, author = {Ce Zhang and Christopher R{\'e} and Michael J. Cafarella and Jaeho Shin and Feiran Wang and Sen Wu}, booktitle = {Commun. ACM}, title = {DeepDive: declarative knowledge base construction.}, url = {http://doi.acm.org/10.1145/3060586}, venue = {-}, year = {2017} }
CoRR, January 2017
@inproceedings{abc, author = {Heng Guo and Kaan Kara and Ce Zhang}, booktitle = {CoRR}, title = {Layerwise Systematic Scan: Deep Boltzmann Machines and Beyond.}, url = {http://arxiv.org/abs/1705.05154}, year = {2017} }
CoRR, January 2017
@inproceedings{abc, author = {Xiangru Lian and Ce Zhang and Huan Zhang and Cho-Jui Hsieh and Wei Zhang and Ji Liu}, booktitle = {CoRR}, title = {Can Decentralized Algorithms Outperform Centralized Algorithms? A Case Study for Decentralized Parallel Stochastic Gradient Descent.}, url = {http://arxiv.org/abs/1705.09056}, year = {2017} }
2016
2016 IEEE International Conference on Big Data, BigData 2016, Washington DC, USA, December 2016
@inproceedings{abc, author = {Heqing Huang and Cong Zheng and Junyuan Zeng and Wu Zhou and Sencun Zhu and Peng Liu and Suresh Chari and Ce Zhang}, booktitle = {2016 IEEE International Conference on Big Data, BigData 2016, Washington DC, USA}, title = {Android malware development on public malware scanning platforms: A large-scale data-driven study.}, url = {http://dx.doi.org/10.1109/BigData.2016.7840712}, year = {2016} }
54th Annual Allerton Conference on Communication, Control, and Computing, Allerton 2016, Monticello, IL, USA, September 2016
@inproceedings{abc, author = {Ioannis Mitliagkas and Ce Zhang and Stefan Hadjis and Christopher R{\'e}}, booktitle = {54th Annual Allerton Conference on Communication, Control, and Computing, Allerton 2016, Monticello, IL, USA}, title = {Asynchrony begets momentum, with an application to deep learning.}, url = {http://dx.doi.org/10.1109/ALLERTON.2016.7852343}, year = {2016} }
Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016, San Francisco, CA, USA, June 2016
DeepDive is a system for extracting relational databases from dark data: the mass of text, tables, and images that are widely collected and stored but which cannot be exploited by standard relational tools. If the information in dark data - scientific papers, Web classified ads, customer service notes, and so on - were instead in a relational database, it would give analysts a massive and valuable new set of "big data." DeepDive is distinctive when compared to previous information extraction systems in its ability to obtain very high precision and recall at reasonable engineering cost; in a number of applications, we have used DeepDive to create databases with accuracy that meets that of human annotators. To date we have successfully deployed DeepDive to create data-centric applications for insurance, materials science, genomics, paleontology, law enforcement, and others. The data unlocked by DeepDive represents a massive opportunity for industry, government, and scientific researchers. DeepDive is enabled by an unusual design that combines large-scale probabilistic inference with a novel developer interaction cycle. This design is enabled by several core innovations around probabilistic training and inference.
@inproceedings{abc, abstract = {DeepDive is a system for extracting relational databases from dark data: the mass of text, tables, and images that are widely collected and stored but which cannot be exploited by standard relational tools. If the information in dark data - scientific papers, Web classified ads, customer service notes, and so on - were instead in a relational database, it would give analysts a massive and valuable new set of "big data." DeepDive is distinctive when compared to previous information extraction systems in its ability to obtain very high precision and recall at reasonable engineering cost; in a number of applications, we have used DeepDive to create databases with accuracy that meets that of human annotators. To date we have successfully deployed DeepDive to create data-centric applications for insurance, materials science, genomics, paleontologists, law enforcement, and others. The data unlocked by DeepDive represents a massive opportunity for industry, government, and scientific researchers. DeepDive is enabled by an unusual design that combines large-scale probabilistic inference with a novel developer interaction cycle. This design is enabled by several core innovations around probabilistic training and inference.}, author = {Ce Zhang and Jaeho Shin and Christopher R{\'e} and Michael J. Cafarella and Feng Niu}, booktitle = {Proceedings of the 2016 International Conference on Management of Data, SIGMOD Conference 2016}, title = {Extracting Databases from Dark Data with DeepDive.}, url = {http://doi.acm.org/10.1145/2882903.2904442}, venue = {San Francisco, CA, USA}, year = {2016} }
ETH Zürich, January 2016
@inproceedings{abc, author = {Ioannis Mitliagkas and Ce Zhang and Stefan Hadjis and Christopher R{\'e}}, booktitle = {ETH Z{\"u}rich}, title = {Asynchrony begets Momentum, with an Application to Deep Learning}, year = {2016} }
Bioinformatics, January 2016
@article{abc, author = {Emily K. Mallory and Ce Zhang and Christopher R{\'e} and Russ B. Altman}, journal = {Bioinformatics}, title = {Large-scale extraction of gene interactions from full-text literature using DeepDive.}, url = {http://dx.doi.org/10.1093/bioinformatics/btv476}, year = {2016} }
ACM Trans. Database Syst., January 2016
@article{abc, author = {Ce Zhang and Arun Kumar and Christopher R{\'e}}, journal = {ACM Trans. Database Syst.}, title = {Materialization Optimizations for Feature Selection Workloads.}, url = {http://doi.acm.org/10.1145/2877204}, year = {2016} }
CoRR, January 2016
@article{abc, author = {Ioannis Mitliagkas and Ce Zhang and Stefan Hadjis and Christopher R{\'e}}, journal = {CoRR}, title = {Asynchrony begets Momentum, with an Application to Deep Learning.}, url = {http://arxiv.org/abs/1605.09774}, year = {2016} }
CoRR, January 2016
@article{abc, author = {Xinghao Pan and Maximilian Lam and Stephen Tu and Dimitris S. Papailiopoulos and Ce Zhang and Michael I. Jordan and Kannan Ramchandran and Christopher R{\'e} and Benjamin Recht}, journal = {CoRR}, title = {CYCLADES: Conflict-free Asynchronous Machine Learning.}, url = {http://arxiv.org/abs/1605.09721}, year = {2016} }
SIGMOD Record, January 2016
The dark data extraction or knowledge base construction (KBC) problem is to populate a SQL database with information from unstructured data sources including emails, webpages, and pdf reports. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems. The key idea in DeepDive is that statistical inference and machine learning are key tools to attack classical data problems in extraction, cleaning, and integration in a unified and more effective manner. DeepDive programs are declarative in that one cannot write probabilistic inference algorithms; instead, one interacts by defining features or rules about the domain. A key reason for this design choice is to enable domain experts to build their own KBC systems. We present the applications, abstractions, and techniques of DeepDive employed to accelerate construction of KBC systems.
@article{abc, abstract = {The dark data extraction or knowledge base construction (KBC) problem is to populate a SQL database with information from unstructured data sources including emails, webpages, and pdf reports. KBC is a long-standing problem in industry and research that encompasses problems of data extraction, cleaning, and integration. We describe DeepDive, a system that combines database and machine learning ideas to help develop KBC systems. The key idea in DeepDive is that statistical inference and machine learning are key tools to attack classical data problems in extraction, cleaning, and integration in a unified and more effective manner. DeepDive programs are declarative in that one cannot write probabilistic inference algorithms; instead, one interacts by defining features or rules about the domain. A key reason for this design choice is to enable domain experts to build their own KBC systems. We present the applications, abstractions, and techniques of DeepDive employed to accelerate construction of KBC systems.}, author = {Christopher De Sa and Alexander Ratner and Christopher R{\'e} and Jaeho Shin and Feiran Wang and Sen Wu and Ce Zhang}, journal = {SIGMOD Record}, title = {DeepDive: Declarative Knowledge Base Construction.}, url = {http://doi.acm.org/10.1145/2949741.2949756}, year = {2016} }
CoRR, January 2016
@article{abc, author = {Stefan Hadjis and Ce Zhang and Ioannis Mitliagkas and Christopher R{\'e}}, journal = {CoRR}, title = {Omnivore: An Optimizer for Multi-device Deep Learning on CPUs and GPUs.}, url = {http://arxiv.org/abs/1606.04487}, year = {2016} }
CoRR, January 2016
We present ZipML, the first framework for training dense generalized linear models using end-to-end low-precision representation--in ZipML, all movements of data, including those for input samples, model, and gradients, are represented using as little as two bits per component. Within our framework, we have successfully compressed, separately, the input data by 16x, gradient by 16x, and model by 16x while still getting the same training result. Even for the most challenging datasets, we find that robust convergence can be ensured using only an end-to-end 8-bit representation or a 6-bit representation if only samples are quantized. Our work builds on previous research on using low-precision representations for gradient and model in the context of stochastic gradient descent. Our main technical contribution is a new set of techniques which allow the training samples to be processed with low precision, without affecting the convergence of the algorithm. In turn, this leads to a system where all data items move in a quantized, low precision format. In particular, we first establish that randomized rounding, while sufficient when quantizing the model and the gradients, is biased when quantizing samples, and thus leads to a different training result. We propose two new data representations which converge to the same solution as in the original data representation both in theory and empirically and require as little as 2 bits per component. As a result, if the original data is stored as 32-bit floats, we decrease the bandwidth footprint for each training iteration by up to 16x. Our results hold for models such as linear regression and least squares SVM. ZipML raises interesting theoretical questions related to the robustness of SGD to approximate data, model, and gradient representations. We conclude this working paper with a description of ongoing work extending these preliminary results.
@article{abc, abstract = {We present ZipML, the first framework for training dense generalized linear models using end-to-end low-precision representation--in ZipML, all movements of data, including those for input samples, model, and gradients, are represented using as little as two bits per component. Within our framework, we have successfully compressed, separately, the input data by 16x, gradient by 16x, and model by 16x while still getting the same training result. Even for the most challenging datasets, we find that robust convergence can be ensured using only an end-to-end 8-bit representation or a 6-bit representation if only samples are quantized. Our work builds on previous research on using low-precision representations for gradient and model in the context of stochastic gradient descent. Our main technical contribution is a new set of techniques which allow the training samples to be processed with low precision, without affecting the convergence of the algorithm. In turn, this leads to a system where all data items move in a quantized, low precision format. In particular, we first establish that randomized rounding, while sufficient when quantizing the model and the gradients, is biased when quantizing samples, and thus leads to a different training result. We propose two new data representations which converge to the same solution as in the original data representation both in theory and empirically and require as little as 2-bits per component. As a result, if the original data is stored as 32-bit floats, we decrease the bandwidth footprint for each training iteration by up to 16x. Our results hold for models such as linear regression and least squares SVM. ZipML raises interesting theoretical questions related to the robustness of SGD to approximate data, model, and gradient representations. We conclude this working paper by a description of ongoing work extending these preliminary results. }, author = {Hantian Zhang and Kaan Kara and Jerry Li and Dan Alistarh and Ji Liu and Ce Zhang}, journal = {CoRR}, title = {ZipML: An End-to-end Bitwise Framework for Dense Generalized Linear Models.}, url = {http://arxiv.org/abs/1611.05402}, year = {2016} }
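The claim that randomized rounding is biased when quantizing samples can be seen on the least-squares gradient, which contains the product x(xᵀw): reusing one quantized copy of x in both factors inflates the diagonal terms by the quantization variance, whereas two independently drawn copies remain unbiased. A toy demonstration, under the assumption that one of the paper's proposed representations behaves like this two-copy estimator:

import numpy as np

def srq(x, bits=2):
    # Stochastic rounding onto a uniform grid in [-1, 1]; unbiased per entry.
    levels = (1 << bits) - 1
    t = (np.clip(x, -1, 1) + 1) / 2 * levels
    low = np.floor(t)
    return (low + (np.random.rand(*x.shape) < (t - low))) / levels * 2 - 1

rng = np.random.default_rng(0)
x, w = rng.uniform(-1, 1, 8), rng.uniform(-1, 1, 8)
exact = x * (x @ w)                        # the x x^T w term of the gradient
single = np.mean([(lambda q: q * (q @ w))(srq(x)) for _ in range(20000)], axis=0)
double = np.mean([srq(x) * (srq(x) @ w) for _ in range(20000)], axis=0)
print(np.abs(single - exact).max())        # stays visibly away from 0: biased
print(np.abs(double - exact).max())        # shrinks toward 0 with more draws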
2015
Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, Montreal, Quebec, Canada, December 2015
@inproceedings{abc, author = {Christopher De Sa and Ce Zhang and Kunle Olukotun and Christopher R{\'e}}, booktitle = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015}, title = {Rapidly Mixing Gibbs Sampling for a Class of Factor Graphs Using Hierarchy Width.}, url = {http://papers.nips.cc/paper/5757-rapidly-mixing-gibbs-sampling-for-a-class-of-factor-graphs-using-hierarchy-width}, venue = {Montreal, Quebec, Canada}, year = {2015} }
Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015, Montreal, Quebec, Canada, December 2015
@inproceedings{abc, author = {Christopher De Sa and Ce Zhang and Kunle Olukotun and Christopher R{\'e}}, booktitle = {Advances in Neural Information Processing Systems 28: Annual Conference on Neural Information Processing Systems 2015}, title = {Taming the Wild: A Unified Analysis of Hogwild-Style Algorithms.}, url = {http://papers.nips.cc/paper/5717-taming-the-wild-a-unified-analysis-of-hogwild-style-algorithms}, venue = {Montreal, Quebec, Canada}, year = {2015} }
Proceedings of the Fourth Workshop on Data analytics in the Cloud, DanaC 2015, Melbourne, VIC, Australia, May 2015
@inproceedings{abc, author = {Stefan Hadjis and Firas Abuzaid and Ce Zhang and Christopher R{\'e}}, booktitle = {Proceedings of the Fourth Workshop on Data analytics in the Cloud, DanaC 2015, Melbourne, VIC, Australia}, title = {Caffe con Troll: Shallow Ideas to Speed Up Deep Learning.}, url = {http://doi.acm.org/10.1145/2799562.2799641}, year = {2015} }
CoRR, January 2015
@article{abc, author = {Sen Wu and Ce Zhang and Feiran Wang and Christopher R{\'e}}, journal = {CoRR}, title = {Incremental Knowledge Base Construction Using DeepDive.}, url = {http://arxiv.org/abs/1502.00731}, year = {2015} }
CoRR, January 2015
@article{abc, author = {Firas Abuzaid and Stefan Hadjis and Ce Zhang and Christopher R{\'e}}, journal = {CoRR}, title = {Caffe con Troll: Shallow Ideas to Speed Up Deep Learning.}, url = {http://arxiv.org/abs/1504.04343}, year = {2015} }
CoRR, January 2015
@article{abc, author = {Christopher De Sa and Ce Zhang and Kunle Olukotun and Christopher R{\'e}}, journal = {CoRR}, title = {Taming the Wild: A Unified Analysis of Hogwild!-Style Algorithms.}, url = {http://arxiv.org/abs/1506.06438}, year = {2015} }
PVLDB, January 2015
@inproceedings{abc, author = {Jaeho Shin and Sen Wu and Feiran Wang and Christopher De Sa and Ce Zhang and Christopher R{\'e}}, booktitle = {PVLDB}, title = {Incremental Knowledge Base Construction Using DeepDive.}, url = {http://www.vldb.org/pvldb/vol8/p1310-shin.pdf}, year = {2015} }
CoRR, January 2015
@article{abc, author = {Yuke Zhu and Ce Zhang and Christopher R{\'e} and Li Fei-Fei}, journal = {CoRR}, title = {Building a Large-scale Multimodal Knowledge Base for Visual Question Answering.}, url = {http://arxiv.org/abs/1507.05670}, year = {2015} }
CoRR, January 2015
@article{abc, author = {Christopher De Sa and Ce Zhang and Kunle Olukotun and Christopher R{\'e}}, journal = {CoRR}, title = {Rapidly Mixing Gibbs Sampling for a Class of Factor Graphs Using Hierarchy Width.}, url = {http://arxiv.org/abs/1510.00756}, year = {2015} }
2014
Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014, Montreal, Quebec, Canada, December 2014
@inproceedings{abc, author = {Yingbo Zhou and Utkarsh Porwal and Ce Zhang and Hung Q. Ngo and Long Nguyen and Christopher R{\'e} and Venu Govindaraju}, booktitle = {Advances in Neural Information Processing Systems 27: Annual Conference on Neural Information Processing Systems 2014}, title = {Parallel Feature Selection Inspired by Group Testing.}, url = {http://papers.nips.cc/paper/5296-parallel-feature-selection-inspired-by-group-testing}, venue = {Montreal, Quebec, Canada}, year = {2014} }
Proceedings of the 2nd International Workshop on In Memory Data Management and Analytics, IMDM 2014, Hangzhou, China, September 2014
@inproceedings{abc, author = {Victor Bittorf and Marcel Kornacker and Christopher R{\'e} and Ce Zhang}, booktitle = {Proceedings of the 2nd International Workshop on In Memory Data Management and Analytics, IMDM 2014, Hangzhou, China}, title = {Tradeoffs in Main-Memory Statistical Analytics from Impala to DimmWitted.}, url = {http://www-db.in.tum.de/hosted/imdm2014/papers/bittorf.pdf}, year = {2014} }
International Conference on Management of Data, SIGMOD 2014, Snowbird, UT, USA, June 2014
@inproceedings{abc, author = {Ce Zhang and Arun Kumar and Christopher R{\'e}}, booktitle = {International Conference on Management of Data, SIGMOD 2014, Snowbird, UT, USA}, title = {Materialization optimizations for feature selection workloads.}, url = {http://doi.acm.org/10.1145/2588555.2593678}, year = {2014} }
CoRR, January 2014
@article{abc, author = {Ce Zhang and Christopher R{\'e}}, journal = {CoRR}, title = {DimmWitted: A Study of Main-Memory Statistical Analytics.}, url = {http://arxiv.org/abs/1403.7550}, year = {2014} }
CoRR, January 2014
@article{abc, author = {Shanan Peters and Ce Zhang and Miron Livny and Christopher R{\'e}}, journal = {CoRR}, title = {A machine-compiled macroevolutionary history of Phanerozoic life.}, url = {http://arxiv.org/abs/1406.2963}, year = {2014} }
CoRR, January 2014
@article{abc, author = {Ce Zhang and Christopher R{\'e} and Amir Abbas Sadeghian and Zifei Shan and Jaeho Shin and Feiran Wang and Sen Wu}, journal = {CoRR}, title = {Feature Engineering for Knowledge Base Construction.}, url = {http://arxiv.org/abs/1407.6439}, year = {2014} }
PVLDB, January 2014
@inproceedings{abc, author = {Ce Zhang and Christopher R{\'e}}, booktitle = {PVLDB}, title = {DimmWitted: A Study of Main-Memory Statistical Analytics.}, url = {http://www.vldb.org/pvldb/vol7/p1283-zhang.pdf}, year = {2014} }
IEEE Data Eng. Bull., January 2014
@inproceedings{abc, author = {Christopher R{\'e} and Amir Abbas Sadeghian and Zifei Shan and Jaeho Shin and Feiran Wang and Sen Wu and Ce Zhang}, booktitle = {IEEE Data Eng. Bull.}, title = {Feature Engineering for Knowledge Base Construction.}, url = {http://sites.computer.org/debull/A14sept/p26.pdf}, year = {2014} }
2013
Advances in Neural Information Processing Systems 26: 27th Annual Conference on Neural Information Processing Systems 2013, Lake Tahoe, Nevada, United States, December 2013
@inproceedings{abc, author = {Srikrishna Sridhar and Stephen J. Wright and Christopher R{\'e} and Ji Liu and Victor Bittorf and Ce Zhang}, booktitle = {Advances in Neural Information Processing Systems 26: 27th Annual Conference on Neural Information Processing Systems 2013. Proceedings of a meeting held December 5-8, 2013, Lake Tahoe, Nevada, United States.}, title = {An Approximate, Efficient LP Solver for LP Rounding.}, url = {http://papers.nips.cc/paper/4990-an-approximate-efficient-lp-solver-for-lp-rounding}, venue = {Lake Tahoe, Nevada, United States.}, year = {2013} }
Proceedings of The Twenty-Second Text REtrieval Conference, TREC 2013, Gaithersburg, Maryland, USA, November 2013
@inproceedings{abc, author = {John R. Frank and Steven J. Bauer and Max Kleiman-Weiner and Daniel A. Roberts and Nilesh Tripuraneni and Ce Zhang and Christopher R{\'e} and Ellen M. Voorhees and Ian Soboroff}, booktitle = {Proceedings of The Twenty-Second Text REtrieval Conference, TREC 2013, Gaithersburg, Maryland, USA}, title = {Evaluating Stream Filtering for Entity Profile Updates for TREC 2013.}, url = {http://trec.nist.gov/pubs/trec22/papers/KBA.OVERVIEW.pdf}, year = {2013} }
Proceedings of The Twenty-Second Text REtrieval Conference, TREC 2013, Gaithersburg, Maryland, USA, November 2013
@inproceedings{abc, author = {Tushar Khot and Ce Zhang and Jude W. Shavlik and Sriraam Natarajan and Christopher R{\'e}}, booktitle = {Proceedings of The Twenty-Second Text REtrieval Conference, TREC 2013, Gaithersburg, Maryland, USA}, title = {Bootstrapping Knowledge Base Acceleration.}, url = {http://trec.nist.gov/pubs/trec22/papers/wisc-kba.pdf}, year = {2013} }
Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics, ACL 2013, Sofia, Bulgaria, Volume 2: Short Papers, August 2013
@inproceedings{abc, author = {Vidhya Govindaraju and Ce Zhang and Christopher R{\'e}}, booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics, ACL 2013}, title = {Understanding Tables in Context Using Standard NLP Toolkits.}, url = {http://aclweb.org/anthology/P/P13/P13-2116.pdf}, venue = {Sofia, Bulgaria, Volume 2: Short Papers}, year = {2013} }
Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2013, New York, NY, USA, June 2013
@inproceedings{abc, author = {Ce Zhang and Christopher R{\'e}}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2013, New York, NY, USA}, title = {Towards high-throughput gibbs sampling at scale: a study across storage managers.}, url = {http://doi.acm.org/10.1145/2463676.2463702}, year = {2013} }
Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2013, New York, NY, USA, June 2013
@inproceedings{abc, author = {Ce Zhang and Vidhya Govindaraju and Jackson Borchardt and Tim Foltz and Christopher R{\'e} and Shanan Peters}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2013, New York, NY, USA}, title = {GeoDeepDive: statistical inference using familiar data-processing languages.}, url = {http://doi.acm.org/10.1145/2463676.2463680}, year = {2013} }
CoRR, January 2013
Many problems in machine learning can be solved by rounding the solution of an appropriate linear program (LP). This paper shows that we can recover solutions of comparable quality by rounding an approximate LP solution instead of the exact one. These approximate LP solutions can be computed efficiently by applying a parallel stochastic-coordinate-descent method to a quadratic-penalty formulation of the LP. We derive worst-case runtime and solution quality guarantees of this scheme using novel perturbation and convergence analysis. Our experiments demonstrate that on such combinatorial problems as vertex cover, independent set and multiway-cut, our approximate rounding scheme is up to an order of magnitude faster than Cplex (a commercial LP solver) while producing solutions of similar quality.
@inproceedings{abc, abstract = {Many problems in machine learning can be solved by rounding the solution of an appropriate linear program (LP). This paper shows that we can recover solutions of comparable quality by rounding an approximate LP solution instead of the exact one. These approximate LP solutions can be computed efficiently by applying a parallel stochastic-coordinate-descent method to a quadratic-penalty formulation of the LP. We derive worst-case runtime and solution quality guarantees of this scheme using novel perturbation and convergence analysis. Our experiments demonstrate that on such combinatorial problems as vertex cover, independent set and multiway-cut, our approximate rounding scheme is up to an order of magnitude faster than Cplex (a commercial LP solver) while producing solutions of similar quality.}, author = {Srikrishna Sridhar and Victor Bittorf and Ji Liu and Ce Zhang and Christopher R{\'e} and Stephen J. Wright}, booktitle = {CoRR}, title = {An Approximate, Efficient Solver for LP Rounding.}, url = {http://arxiv.org/abs/1311.2661}, venue = {-}, year = {2013} }
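For context, the quadratic-penalty step can be written compactly: the LP is replaced by a smooth, box-constrained problem whose coordinate updates are cheap and parallelizable. A hedged LaTeX sketch, with the constraint direction and penalty scaling chosen for illustration (the paper's exact constants may differ):

\min_{x \in [0,1]^n} c^\top x \quad \text{s.t.} \quad Ax \le b
\qquad \longrightarrow \qquad
\min_{x \in [0,1]^n} c^\top x + \frac{1}{2\beta}\, \bigl\| \max(Ax - b,\, 0) \bigr\|_2^2

For small enough \beta > 0 the penalized minimizer is near-feasible, so rounding it yields solutions comparable to rounding the exact LP optimum, which is what the paper's worst-case quality guarantees quantify.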
CIDR 2013, Sixth Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA, January 2013
@inproceedings{abc, author = {Michael Anderson and Dolan Antenucci and Victor Bittorf and Matthew Burgess and Michael J. Cafarella and Arun Kumar and Feng Niu and Yongjoo Park and Christopher R{\'e} and Ce Zhang}, booktitle = {CIDR 2013, Sixth Biennial Conference on Innovative Data Systems Research, Asilomar, CA, USA}, title = {Brainwash: A Data System for Feature Engineering.}, url = {http://www.cidrdb.org/cidr2013/Papers/CIDR13_Paper82.pdf}, year = {2013} }
2012
12th IEEE International Conference on Data Mining, ICDM 2012, Brussels, Belgium, December 2012
@inproceedings{abc, author = {Feng Niu and Ce Zhang and Christopher R{\'e} and Jude W. Shavlik}, booktitle = {12th IEEE International Conference on Data Mining, ICDM 2012, Brussels, Belgium}, title = {Scaling Inference for Markov Logic via Dual Decomposition.}, url = {http://dx.doi.org/10.1109/ICDM.2012.96}, year = {2012} }
Proceedings of The Twenty-First Text REtrieval Conference, TREC 2012, Gaithersburg, Maryland, USA, November 2012
@inproceedings{abc, author = {John R. Frank and Max Kleiman-Weiner and Daniel A. Roberts and Feng Niu and Ce Zhang and Christopher R{\'e} and Ian Soboroff}, booktitle = {Proceedings of The Twenty-First Text REtrieval Conference, TREC 2012, Gaithersburg, Maryland, USA}, title = {Building an Entity-Centric Stream Filtering Test Collection for TREC 2012.}, url = {http://trec.nist.gov/pubs/trec21/papers/KBA.OVERVIEW.pdf}, year = {2012} }
Information Computing and Applications - Third International Conference, ICICA 2012, Chengde, China, September 2012
@inproceedings{abc, author = {Ce Zhang and Gang Cui and Bin Jin and Liang Wang}, booktitle = {Information Computing and Applications - Third International Conference, ICICA 2012, Chengde, China}, title = {Study of Trustworthiness Measurement and Kernel Modules Accessing Address Space of Any Process.}, url = {http://dx.doi.org/10.1007/978-3-642-34062-8_56}, year = {2012} }
Proceedings of the Second International Workshop on Searching and Integrating New Web Data Sources, Istanbul, Turkey, August 2012
@inproceedings{abc, author = {Feng Niu and Ce Zhang and Christopher R{\'e} and Jude W. Shavlik}, booktitle = {Proceedings of the Second International Workshop on Searching and Integrating New Web Data Sources, Istanbul, Turkey}, title = {DeepDive: Web-scale Knowledge-base Construction using Statistical Learning and Inference.}, url = {http://ceur-ws.org/Vol-884/VLDS2012_p25_Niu.pdf}, year = {2012} }
The 50th Annual Meeting of the Association for Computational Linguistics, Proceedings of the Conference, Jeju Island, Korea - Volume 1: Long Papers, July 2012
@inproceedings{abc, author = {Ce Zhang and Feng Niu and Christopher R{\'e} and Jude W. Shavlik}, booktitle = {The 50th Annual Meeting of the Association for Computational Linguistics, Proceedings of the Conference}, title = {Big Data versus the Crowd: Looking for Relationships in All the Right Places.}, url = {http://www.aclweb.org/anthology/P12-1087}, venue = {Jeju Island, Korea - Volume 1: Long Papers}, year = {2012} }
Int. J. Semantic Web Inf. Syst., January 2012
@inproceedings{abc, author = {Feng Niu and Ce Zhang and Christopher R{\'e} and Jude W. Shavlik}, booktitle = {Int. J. Semantic Web Inf. Syst.}, title = {Elementary: Large-Scale Knowledge-Base Construction via Machine Learning and Statistical Inference.}, url = {http://dx.doi.org/10.4018/jswis.2012070103}, year = {2012} }
2011
Database Systems for Advanced Applications - 16th International Conference, DASFAA 2011, Hong Kong, China, April 2011
@inproceedings{abc, author = {Junjie Yao and Bin Cui and Qiaosha Han and Ce Zhang and Yanhong Zhou}, booktitle = {Database Systems for Advanced Applications - 16th International Conference, DASFAA 2011, Hong Kong, China}, title = {Modeling User Expertise in Folksonomies by Fusing Multi-type Features.}, url = {http://dx.doi.org/10.1007/978-3-642-20149-3_6}, year = {2011} }
CoRR, January 2011
@article{abc, author = {Feng Niu and Ce Zhang and Christopher R{\'e} and Jude W. Shavlik}, journal = {CoRR}, title = {Felix: Scaling Inference for Markov Logic with an Operator-based Approach}, url = {http://arxiv.org/abs/1108.0294}, year = {2011} }
2010
Proceeding of the 33rd International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 2010, Geneva, Switzerland, July 2010
@inproceedings{abc, author = {Bin Cui and Ce Zhang and Gao Cong}, booktitle = {Proceeding of the 33rd International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR 2010, Geneva, Switzerland}, title = {Content-enriched classifier for web video classification.}, url = {http://doi.acm.org/10.1145/1835449.1835553}, year = {2010} }
Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2010, Indianapolis, Indiana, USA, June 2010
@inproceedings{abc, author = {Bin Cui and Anthony K. H. Tung and Ce Zhang and Zhe Zhao}, booktitle = {Proceedings of the ACM SIGMOD International Conference on Management of Data, SIGMOD 2010, Indianapolis, Indiana, USA}, title = {Multiple feature fusion for social media applications.}, url = {http://doi.acm.org/10.1145/1807167.1807216}, year = {2010} }
2009
Database Systems for Advanced Applications, 14th International Conference, DASFAA 2009, Brisbane, Australia, January 2009
@inproceedings{abc, author = {Ce Zhang and Bin Cui and Gao Cong and Yu-Jing Wang}, booktitle = {Database Systems for Advanced Applications, 14th International Conference, DASFAA 2009, Brisbane, Australia}, title = {A Revisit of Query Expansion with Different Semantic Levels.}, url = {http://dx.doi.org/10.1007/978-3-642-00887-0_58}, year = {2009} }
Database Systems for Advanced Applications, 14th International Conference, DASFAA 2009, Brisbane, Australia, January 2009
@inproceedings{abc, author = {Bin Cui and Bei Pan and Heng Tao Shen and Ying Wang and Ce Zhang}, booktitle = {Database Systems for Advanced Applications, 14th International Conference, DASFAA 2009, Brisbane, Australia}, title = {Video Annotation System Based on Categorizing and Keyword Labelling.}, url = {http://dx.doi.org/10.1007/978-3-642-00887-0_68}, year = {2009} }
Proceedings of the 18th ACM Conference on Information and Knowledge Management, CIKM 2009, Hong Kong, China, January 2009
@inproceedings{abc, author = {Xin Cao and Gao Cong and Bin Cui and Christian S. Jensen and Ce Zhang}, booktitle = {Proceedings of the 18th ACM Conference on Information and Knowledge Management, CIKM 2009, Hong Kong, China}, title = {The use of categorization information in language models for question retrieval.}, url = {http://doi.acm.org/10.1145/1645953.1645989}, year = {2009} }
2008
Proceedings of the 17th International Conference on World Wide Web, WWW 2008, Beijing, China, January 2008
@inproceedings{abc, author = {Ce Zhang and Yu-Jing Wang and Bin Cui and Gao Cong}, booktitle = {Proceedings of the 17th International Conference on World Wide Web, WWW 2008, Beijing, China}, title = {Semantic similarity based on compact concept ontology.}, url = {http://doi.acm.org/10.1145/1367497.1367688}, year = {2008} }