Publications by Irina Botan
2013
VLDB J., January 2013
@comment{Journal paper in The VLDB Journal: typed as an article, with the DOI stored bare next to the original resolver URL.}
@article{dindar2013secret,
  author  = {Dindar, Nihal and Tatbul, Nesime and Miller, Ren{\'e}e J. and Haas, Laura M. and Botan, Irina},
  title   = {Modeling the execution semantics of stream processing engines with {SECRET}},
  journal = {The {VLDB} Journal},
  year    = {2013},
  doi     = {10.1007/s00778-012-0297-3},
  url     = {http://dx.doi.org/10.1007/s00778-012-0297-3},
}
2012
15th International Conference on Extending Database Technology, EDBT '12, Berlin, Germany, January 2012
@comment{EDBT 2012 conference paper; the key is unique and the DOI is stored bare next to the original resolver URL.}
@inproceedings{botan2012transactional,
  author    = {Botan, Irina and Fischer, Peter M. and Kossmann, Donald and Tatbul, Nesime},
  title     = {Transactional Stream Processing},
  booktitle = {15th International Conference on Extending Database Technology, EDBT {\textquoteright}12, Berlin, Germany},
  year      = {2012},
  doi       = {10.1145/2247596.2247622},
  url       = {http://doi.acm.org/10.1145/2247596.2247622},
}
2011
ETH Zürich, Diss. Nr. 19694, May 2011
Supervised by: Prof. Donald Kossmann
A variety of applications require low-latency processing of data that comes in highly dynamic streams of items. These applications are implemented using Data Stream Management Systems (DSMSs). More recently, new application domains like real-time business intelligence turned to the “on-the-fly” processing model employed by these systems for a solution to their challenges. As a result, the requirements imposed on the DSMSs have become more complex: e.g., mechanisms for correlating data streams with stored information or near-real time complex analysis of large portions of streaming data.
In order to meet the evolving requirements of modern streaming applications, a clean, flexible and high performance DSMS design is required. Although many system implementations were proposed, none of them offers a clean, systematic approach to data storage management. Rather, the storage manager is usually tightly coupled with the continuous query execution engine. This design decision limits the possibility for further performance improvement and severely restricts the flexibility necessary to accommodate new application requirements. Moreover, today, there is no standard for querying streams and, as a result, each DSMS exposes its own execution semantics, making the implementation of the new requirements even more challenging.
This dissertation investigates the design and implementation of a general-purpose storage management framework for Data Stream Management Systems, that we name SMS (Storage Manager for Streams). The ultimate goal of this framework is to provide a general, clean, flexible and high-performance storage management system which could be virtually “plugged” into any DSMS. In order to achieve this goal, in this work, we combine the experience gained over decades of research on Database Management Systems with the high-performance mechanisms employed by the Data Stream Management Systems.
Following the database systems architecture design, this framework is based on the principle of separating concerns: the query processor is decoupled from the storage manager. As such, the storage system obtains the flexibility necessary to accommodate new requirements, behind a general interface. Moreover, it can provide specialized store implementations tailored to the particular requirements of the applications, which is key to achieving good performance. In this respect, an important contribution of the framework is the reuse of the access patterns of the continuous query operators to tune the stores’ implementation and as such, to speed up the access on materialized data. In addition, the unified transactional model proposed in this dissertation makes minimal extensions to the traditional transactional model in order to accommodate streams and continuous queries. As a result, it offers a clean semantics for continuous query execution over arbitrary combinations of data sources (streaming and stored) in the presence of concurrent access and failures. And even more, it can be used to explain the transactional behavior of state-of-the-art DSMSs.
A series of experiments are conducted using the Linear Road streaming benchmark’s implementation in MXQuery (a Java-based open-source XQuery engine, extended with window functions for continuous processing). MXQuery uses SMS for all its data storage related tasks. Our experiments show that the response time of the continuous queries can indeed be lowered if the store implementations are tuned according to the access patterns of the continuous query operators. Moreover, a transaction manager implementing the unified transactional model and designed as an additional component between the access and storage layers of SMS provides correctness and reliability for the Linear Road application with practically no performance penalty. As such, the experimental results indicate that a storage manager built on these ideas is a promising approach.
@comment{PhD thesis. The school, dissertation number and month come from the listing header "ETH Zürich, Diss. Nr. 19694, May 2011"; the export had put the dissertation number in the school field.}
@phdthesis{botan2011storage,
  author   = {Botan, Irina},
  title    = {Storage Management Techniques for Stream Processing},
  school   = {ETH Z{\"u}rich},
  year     = {2011},
  month    = may,
  note     = {Diss. Nr. 19694},
  abstract = {A variety of applications require low-latency processing of data that comes in highly dynamic streams of items. These applications are implemented using Data Stream Management Systems (DSMSs). More recently, new application domains like real-time business intelligence turned to the {\textquotedblleft}on-the-fly{\textquotedblright} processing model employed by these systems for a solution to their challenges. As a result, the requirements imposed on the DSMSs have become more complex: e.g., mechanisms for correlating data streams with stored information or near-real time complex analysis of large portions of streaming data. In order to meet the evolving requirements of modern streaming applications, a clean, flexible and high performance DSMS design is required. Although many system implementations were proposed, none of them offers a clean, systematic approach to data storage management. Rather, the storage manager is usually tightly coupled with the continuous query execution engine. This design decision limits the possibility for further performance improvement and severely restricts the flexibility necessary to accommodate new application requirements. Moreover, today, there is no standard for querying streams and, as a result, each DSMS exposes its own execution semantics, making the implementation of the new requirements even more challenging. This dissertation investigates the design and implementation of a general-purpose storage management framework for Data Stream Management Systems, that we name SMS (Storage Manager for Streams). The ultimate goal of this framework is to provide a general, clean, flexible and high-performance storage management system which could be virtually {\textquotedblleft}plugged{\textquotedblright} into any DSMS.
    In order to achieve this goal, in this work, we combine the experience gained over decades of research on Database Management Systems with the high-performance mechanisms employed by the Data Stream Management Systems. Following the database systems architecture design, this framework is based on the principle of separating concerns: the query processor is decoupled from the storage manager. As such, the storage system obtains the flexibility necessary to accommodate new requirements, behind a general interface. Moreover, it can provide specialized store implementations tailored to the particular requirements of the applications, which is key to achieving good performance. In this respect, an important contribution of the framework is the reuse of the access patterns of the continuous query operators to tune the stores{\textquoteright} implementation and as such, to speed up the access on materialized data. In addition, the unified transactional model proposed in this dissertation makes minimal extensions to the traditional transactional model in order to accommodate streams and continuous queries. As a result, it offers a clean semantics for continuous query execution over arbitrary combinations of data sources (streaming and stored) in the presence of concurrent access and failures. And even more, it can be used to explain the transactional behavior of state-of-the-art DSMSs. A series of experiments are conducted using the Linear Road streaming benchmark{\textquoteright}s implementation in MXQuery (a Java-based open-source XQuery engine, extended with window functions for continuous processing). MXQuery uses SMS for all its data storage related tasks. Our experiments show that the response time of the continuous queries can indeed be lowered if the store implementations are tuned according to the access patterns of the continuous query operators.
    Moreover, a transaction manager implementing the unified transactional model and designed as an additional component between the access and storage layers of SMS provides correctness and reliability for the Linear Road application with practically no performance penalty. As such, the experimental results indicate that a storage manager built on these ideas is a promising approach.},
}
VLDB J., January 2011
This paper addresses the problem of minimizing
the staleness of query results for streaming applications
with update semantics under overload conditions. Staleness
is a measure of how out-of-date the results are compared
with the latest data arriving on the input. Real-time streaming
applications are subject to overload due to unpredictably
increasing data rates, while in many of them, we observe that
data streams and queries in fact exhibit “update semantics”
(i.e., the latest input data are all that really matters when
producing a query result). Under such semantics, overload
will cause staleness to build up. The key to avoid this is to
exploit the update semantics of applications as early as possible
in the processing pipeline. In this paper, we propose
UpStream, a storage-centric framework for load management
over streaming applications with update semantics. We
first describe how we model streams and queries that possess
the update semantics, providing definitions for correctness
and staleness for the query results. Then, we show how staleness
can be minimized based on intelligent update key scheduling
techniques applied at the queue level, while preserving
the correctness of the results, even for complex queries that
involve sliding windows. UpStream is based on the simple
idea of applying the updates in place, yet with great returns
in terms of lowering staleness and memory consumption, as
we also experimentally verify on the Borealis system.
@comment{Journal paper in The VLDB Journal. The abstract's mis-encoded HTML entities for curly quotes (147 and 148) are restored as LaTeX quote macros.}
@article{moga2011upstream,
  author   = {Moga, Alexandru and Botan, Irina and Tatbul, Nesime},
  title    = {{UpStream}: storage-centric load management for streaming applications with update semantics},
  journal  = {The {VLDB} Journal},
  year     = {2011},
  doi      = {10.1007/s00778-011-0229-7},
  url      = {http://dx.doi.org/10.1007/s00778-011-0229-7},
  abstract = {This paper addresses the problem of minimizing the staleness of query results for streaming applications with update semantics under overload conditions. Staleness is a measure of how out-of-date the results are compared with the latest data arriving on the input. Real-time streaming applications are subject to overload due to unpredictably increasing data rates, while in many of them, we observe that data streams and queries in fact exhibit {\textquotedblleft}update semantics{\textquotedblright} (i.e., the latest input data are all that really matters when producing a query result). Under such semantics, overload will cause staleness to build up. The key to avoid this is to exploit the update semantics of applications as early as possible in the processing pipeline. In this paper, we propose UpStream, a storage-centric framework for load management over streaming applications with update semantics. We first describe how we model streams and queries that possess the update semantics, providing definitions for correctness and staleness for the query results. Then, we show how staleness can be minimized based on intelligent update key scheduling techniques applied at the queue level, while preserving the correctness of the results, even for complex queries that involve sliding windows. UpStream is based on the simple idea of applying the updates in place, yet with great returns in terms of lowering staleness and memory consumption, as we also experimentally verify on the Borealis system.},
}
2010
Proceedings of the 26th International Conference on Data Engineering, ICDE 2010, Long Beach, California, USA, January 2010
@comment{ICDE 2010 demonstration paper. The conference location stays in the venue field, which classic BibTeX styles ignore.}
@inproceedings{botan2010maxstream,
  author    = {Botan, Irina and Cho, Younggoo and Derakhshan, Roozbeh and Dindar, Nihal and Gupta, Ankush and Haas, Laura M. and Kim, Kihong and Lee, Chulwon and Mundada, Girish and Shan, Ming-Chien and Tatbul, Nesime and Yan, Ying and Yun, Beomjin and Zhang, Jin},
  title     = {A demonstration of the {MaxStream} federated stream processing system},
  booktitle = {Proceedings of the 26th International Conference on Data Engineering, ICDE 2010},
  venue     = {Long Beach, California, USA},
  year      = {2010},
  doi       = {10.1109/ICDE.2010.5447906},
  url       = {http://dx.doi.org/10.1109/ICDE.2010.5447906},
}
PVLDB, Proceedings of the 36th International Conference on Very Large Data Bases (VLDB'10), Singapore, September 2010., January 2010
@comment{VLDB 2010 paper. The export had packed the proceedings title, city and month into the venue field; they are split into booktitle, venue and month, and the PVLDB label is kept as the series.}
@inproceedings{botan2010secret,
  author    = {Botan, Irina and Derakhshan, Roozbeh and Dindar, Nihal and Haas, Laura M. and Miller, Ren{\'e}e J. and Tatbul, Nesime},
  title     = {{SECRET}: A Model for Analysis of the Execution Semantics of Stream Processing Systems},
  booktitle = {Proceedings of the 36th International Conference on Very Large Data Bases (VLDB{\textquoteright}10)},
  series    = {PVLDB},
  venue     = {Singapore},
  year      = {2010},
  month     = sep,
}
2009
Research, January 2009
Data Stream Management Systems (DSMS) operate under strict
performance requirements. Key to meeting such requirements is to
efficiently handle time-critical tasks such as managing internal
states of continuous query operators, traffic on the queues
between operators, as well as providing storage support for
shared computation and archived data. In this paper, we
introduce a general purpose storage management framework for
DSMSs that performs these tasks based on a clean,
loosely-coupled, and flexible system design that also
facilitates performance optimization. An important contribution
of the framework is that, in analogy to buffer management
techniques in relational database systems, it uses information
about the access patterns of streaming applications to tune and
customize the performance of the storage manager. In the paper,
we first analyze typical application requirements at different
granularities in order to identify important tunable parameters
and their corresponding values. Based on these parameters, we
define a general-purpose storage management interface. Using the
interface, a developer can use our SMS (Storage Manager for
Streams) to generate a customized storage manager for streaming
applications. We explore the performance and potential of SMS
through a set of experiments using the Linear Road
benchmark.
@comment{Conference paper on the SMS storage manager. The booktitle is kept exactly as exported; see the internal-note field inside the entry.}
@inproceedings{botan2009flexible,
  author        = {Botan, Irina and Alonso, Gustavo and Tatbul, Nesime and Kossmann, Donald and Fischer, Peter M.},
  title         = {Flexible and Scalable Storage Management for Data-intensive Stream Processing},
  booktitle     = {Research},
  year          = {2009},
  doi           = {10.1145/1516360.1516467},
  url           = {http://doi.acm.org/10.1145/1516360.1516467},
  internal-note = {NOTE(review): booktitle "Research" looks like a track name rather than a proceedings title; presumably this is EDBT 2009 given the DOI prefix -- confirm and replace},
  abstract      = {Data Stream Management Systems (DSMS) operate under strict performance requirements. Key to meeting such requirements is to efficiently handle time-critical tasks such as managing internal states of continuous query operators, traffic on the queues between operators, as well as providing storage support for shared computation and archived data. In this paper, we introduce a general purpose storage management framework for DSMSs that performs these tasks based on a clean, loosely-coupled, and flexible system design that also facilitates performance optimization. An important contribution of the framework is that, in analogy to buffer management techniques in relational database systems, it uses information about the access patterns of streaming applications to tune and customize the performance of the storage manager. In the paper, we first analyze typical application requirements at different granularities in order to identify important tunable parameters and their corresponding values. Based on these parameters, we define a general-purpose storage management interface. Using the interface, a developer can use our SMS (Storage Manager for Streams) to generate a customized storage manager for streaming applications. We explore the performance and potential of SMS through a set of experiments using the Linear Road benchmark.},
}
January 2009
@comment{Technical report on the MaxStream architecture; the key is unique and MaxStream is brace-protected in the title.}
@techreport{botan2009maxstream,
  author        = {Botan, Irina and Cho, Younggoo and Derakhshan, Roozbeh and Haas, Laura M. and Kim, Kihong and Lee, Chulwon and Mundada, Girish and Shan, Ming-Chien and Tatbul, Nesime and Yan, Ying and Yun, Beomjin and Zhang, Jin},
  title         = {Design and Implementation of the {MaxStream} Federated Stream Processing Architecture},
  year          = {2009},
  internal-note = {NOTE(review): required field "institution" is missing and cannot be determined from this file -- confirm and add},
}
January 2009
@comment{Technical report version of the UpStream work; the key is unique and UpStream is brace-protected in the title.}
@techreport{botan2009upstream,
  author        = {Botan, Irina and Tatbul, Nesime and Moga, Alexandru},
  title         = {{UpStream}: Storage-centric Load Management for Data Streams with Update Semantics},
  year          = {2009},
  internal-note = {NOTE(review): required field "institution" is missing and cannot be determined from this file -- confirm and add},
}
Enabling Real-Time Business Intelligence - Third International Workshop, BIRTE 2009, Held at the 35th International Conference on Very Large Databases, VLDB 2009, Lyon, France, Revised Selected Papers, January 2009
@comment{BIRTE 2009 workshop paper. "Revised Selected Papers" is the tail of the proceedings title, so it moves from the venue field into booktitle; the author name Haas is normalised to match the other entries.}
@inproceedings{botan2009federated,
  author    = {Botan, Irina and Cho, Younggoo and Derakhshan, Roozbeh and Dindar, Nihal and Haas, Laura M. and Kim, Kihong and Tatbul, Nesime},
  title     = {Federated Stream Processing Support for Real-Time Business Intelligence Applications},
  booktitle = {Enabling Real-Time Business Intelligence - Third International Workshop, BIRTE 2009, Held at the 35th International Conference on Very Large Databases, VLDB 2009, Lyon, France, Revised Selected Papers},
  year      = {2009},
  doi       = {10.1007/978-3-642-14559-9_2},
  url       = {http://dx.doi.org/10.1007/978-3-642-14559-9_2},
}
2007
Proceedings of the 33rd International Conference on Very Large Data Bases, University of Vienna, Austria, January 2007
@comment{VLDB 2007 conference paper; the key is unique, the title's trailing period is dropped and XQuery is brace-protected.}
@inproceedings{botan2007xquery,
  author    = {Botan, Irina and Fischer, Peter M. and Florescu, Daniela and Kossmann, Donald and Kraska, Tim and Tamosevicius, Rokas},
  title     = {Extending {XQuery} with Window Functions},
  booktitle = {Proceedings of the 33rd International Conference on Very Large Data Bases, University of Vienna, Austria},
  year      = {2007},
  url       = {http://www.vldb.org/conf/2007/papers/research/p75-botan.pdf},
}
2006
January 2006
@comment{2006 technical report with the same title as the VLDB 2007 paper; the key is unique and XQuery is brace-protected.}
@techreport{botan2006xquery,
  author        = {Botan, Irina and Fischer, Peter M. and Florescu, Daniela and Kossmann, Donald and Kraska, Tim and Tamosevicius, Rokas},
  title         = {Extending {XQuery} with Window Functions},
  year          = {2006},
  internal-note = {NOTE(review): required field "institution" is missing and cannot be determined from this file -- confirm and add},
}