Publications by Todd C. Mowry
2017
Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture, Cambridge, MA, USA, October 2017
Many important applications trigger bulk bitwise operations, i.e., bitwise operations on large bit vectors. In fact, recent works design techniques that exploit fast bulk bitwise operations to accelerate databases (bitmap indices, BitWeaving) and web search (BitFunnel). Unfortunately, in existing architectures, the throughput of bulk bitwise operations is limited by the memory bandwidth available to the processing unit (e.g., CPU, GPU, FPGA, processing-in-memory).
To overcome this bottleneck, we propose Ambit, an Accelerator-in-Memory for bulk bitwise operations. Unlike prior works, Ambit exploits the analog operation of DRAM technology to perform bitwise operations completely inside DRAM, thereby exploiting the full internal DRAM bandwidth. Ambit consists of two components. First, simultaneous activation of three DRAM rows that share the same set of sense amplifiers enables the system to perform bitwise AND and OR operations. Second, with modest changes to the sense amplifier, the system can use the inverters present inside the sense amplifier to perform bitwise NOT operations. With these two components, Ambit can perform any bulk bitwise operation efficiently inside DRAM. Ambit largely exploits existing DRAM structure, and hence incurs low cost on top of commodity DRAM designs (1% of DRAM chip area). Importantly, Ambit uses the modern DRAM interface without any changes, and therefore it can be directly plugged onto the memory bus.
Our extensive circuit simulations show that Ambit works as expected even in the presence of significant process variation. Averaged across seven bulk bitwise operations, Ambit improves performance by 32X and reduces energy consumption by 35X compared to state-of-the-art systems. When integrated with Hybrid Memory Cube (HMC), a 3D-stacked DRAM with a logic layer, Ambit improves performance of bulk bitwise operations by 9.7X compared to processing in the logic layer of the HMC. Ambit improves the performance of three real-world data-intensive applications, 1) database bitmap indices, 2) BitWeaving, a technique to accelerate database scans, and 3) bit-vector-based implementation of sets, by 3X-7X compared to a state-of-the-art baseline using SIMD optimizations. We describe four other applications that can benefit from Ambit, including a recent technique proposed to speed up web search. We believe that large performance and energy improvements provided by Ambit can enable other applications to use bulk bitwise operations.
@inproceedings{abc, abstract = {Many important applications trigger bulk bitwise operations, i.e., bitwise operations on large bit vectors. In fact, recent works design techniques that exploit fast bulk bitwise operations to accelerate databases (bitmap indices, BitWeaving) and web search (BitFunnel). Unfortunately, in existing architectures, the throughput of bulk bitwise operations is limited by the memory bandwidth available to the processing unit (e.g., CPU, GPU, FPGA, processing-in-memory). To overcome this bottleneck, we propose Ambit, an Accelerator-in-Memory for bulk bitwise operations. Unlike prior works, Ambit exploits the analog operation of DRAM technology to perform bitwise operations completely inside DRAM, thereby exploiting the full internal DRAM bandwidth. Ambit consists of two components. First, simultaneous activation of three DRAM rows that share the same set of sense amplifiers enables the system to perform bitwise AND and OR operations. Second, with modest changes to the sense amplifier, the system can use the inverters present inside the sense amplifier to perform bitwise NOT operations. With these two components, Ambit can perform any bulk bitwise operation efficiently inside DRAM. Ambit largely exploits existing DRAM structure, and hence incurs low cost on top of commodity DRAM designs (1\% of DRAM chip area). Importantly, Ambit uses the modern DRAM interface without any changes, and therefore it can be directly plugged onto the memory bus. Our extensive circuit simulations show that Ambit works as expected even in the presence of significant process variation. Averaged across seven bulk bitwise operations, Ambit improves performance by 32X and reduces energy consumption by 35X compared to state-of-the-art systems. When integrated with Hybrid Memory Cube (HMC), a 3D-stacked DRAM with a logic layer, Ambit improves performance of bulk bitwise operations by 9.7X compared to processing in the logic layer of the HMC. Ambit improves the performance of three real-world data-intensive applications, 1) database bitmap indices, 2) BitWeaving, a technique to accelerate database scans, and 3) bit-vector-based implementation of sets, by 3X-7X compared to a state-of-the-art baseline using SIMD optimizations. We describe four other applications that can benefit from Ambit, including a recent technique proposed to speed up web search. We believe that large performance and energy improvements provided by Ambit can enable other applications to use bulk bitwise operations.}, author = {Vivek Seshadri and Donghyuk Lee and Thomas Mullins and Hasan Hassan and Amirali Boroumand and Jeremie Kim and Michael A. Kozuch and Onur Mutlu and Phillip B. Gibbons and Todd C. Mowry}, booktitle = {Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture}, title = {Ambit: in-memory accelerator for bulk bitwise operations using commodity DRAM technology}, venue = {Cambridge, MA, USA}, year = {2017} }
2016
2016 IEEE International Symposium on High Performance Computer Architecture, HPCA 2016, Barcelona, Spain, March 2016
@inproceedings{abc, author = {Gennady Pekhimenko and Evgeny Bolotin and Nandita Vijaykumar and Onur Mutlu and Todd C. Mowry and Stephen W. Keckler}, booktitle = {2016 IEEE International Symposium on High Performance Computer Architecture, HPCA 2016, Barcelona, Spain}, title = {A case for toggle-aware compression for GPU systems.}, url = {http://dx.doi.org/10.1109/HPCA.2016.7446064}, year = {2016} }
CoRR, January 2016
Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, different bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the cores. For example, when a GPU is bottlenecked by the available off-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive.
This work describes the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate "assist warps" that execute on GPU cores to perform specific tasks that can improve GPU performance and efficiency.
CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps.
We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.
@article{abc, abstract = {Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, different bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the cores. For example, when a GPU is bottlenecked by the available off-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive. This work describes the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate "assist warps" that execute on GPU cores to perform specific tasks that can improve GPU performance and efficiency. CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps. We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7\% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.}, author = {Nandita Vijaykumar and Gennady Pekhimenko and Adwait Jog and Saugata Ghose and Abhishek Bhowmick and Rachata Ausavarungnirun and Chita R. Das and Mahmut T. Kandemir and Todd C. Mowry and Onur Mutlu}, journal = {CoRR}, title = {A Framework for Accelerating Bottlenecks in GPU Execution with Assist Warps.}, url = {http://arxiv.org/abs/1602.01348}, year = {2016} }
CoRR, January 2016
@article{abc, author = {Vivek Seshadri and Donghyuk Lee and Thomas Mullins and Hasan Hassan and Amirali Boroumand and Jeremie Kim and Michael A. Kozuch and Onur Mutlu and Phillip B. Gibbons and Todd C. Mowry}, journal = {CoRR}, title = {Buddy-RAM: Improving the Performance and Efficiency of Bulk Bitwise Operations Using DRAM.}, url = {http://arxiv.org/abs/1611.09988}, year = {2016} }
TACO, January 2016
@inproceedings{abc, author = {Amir Yazdanbakhsh and Gennady Pekhimenko and Bradley Thwaites and Hadi Esmaeilzadeh and Onur Mutlu and Todd C. Mowry}, booktitle = {TACO}, title = {RFVP: Rollback-Free Value Prediction with Safe-to-Approximate Loads.}, url = {http://doi.acm.org/10.1145/2836168}, year = {2016} }
IEEE Design Test, January 2016
@article{abc, author = {Amir Yazdanbakhsh and Bradley Thwaites and Hadi Esmaeilzadeh and Gennady Pekhimenko and Onur Mutlu and Todd C. Mowry}, journal = {IEEE Design Test}, title = {Mitigating the Memory Bottleneck With Approximate Load Value Prediction.}, url = {http://dx.doi.org/10.1109/MDAT.2015.2504899}, year = {2016} }
2015
Proceedings of the 48th International Symposium on Microarchitecture, MICRO 2015, Waikiki, HI, USA, December 2015
@inproceedings{abc, author = {Vivek Seshadri and Thomas Mullins and Amirali Boroumand and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {Proceedings of the 48th International Symposium on Microarchitecture, MICRO 2015, Waikiki, HI, USA}, title = {Gather-scatter DRAM: in-DRAM address translation to improve the spatial locality of non-unit strided accesses.}, url = {http://doi.acm.org/10.1145/2830772.2830820}, year = {2015} }
Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA, June 2015
Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, diUerent bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the
cores. For example, when a GPU is bottlenecked by the available on-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive. This paper introduces the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate “assist warps” that execute on GPU cores to perform specific tasks that can improve GPU performance and
efficiency. CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps. We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.
@inproceedings{abc, abstract = {Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, diUerent bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the cores. For example, when a GPU is bottlenecked by the available on-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive. This paper introduces the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate {\textquotedblleft}assist warps{\textquotedblright} that execute on GPU cores to perform specific tasks that can improve GPU performance and efficiency. CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps. We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7\% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.}, author = {Nandita Vijaykumar and Gennady Pekhimenko and Adwait Jog and Abhishek Bhowmick and Rachata Ausavarungnirun and Chita R. Das and Mahmut T. Kandemir and Todd C. Mowry and Onur Mutlu}, booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture}, title = {A case for core-assisted bottleneck acceleration in GPUs: enabling flexible data compression with assist warps.}, url = {http://doi.acm.org/10.1145/2749469.2750399}, venue = {Portland, OR, USA}, year = {2015} }
Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA, June 2015
@inproceedings{abc, author = {Vivek Seshadri and Gennady Pekhimenko and Olatunji Ruwase and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry and Trishul M. Chilimbi}, booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA}, title = {Page overlays: an enhanced virtual memory framework to enable fine-grained memory management.}, url = {http://doi.acm.org/10.1145/2749469.2750379}, year = {2015} }
21st IEEE International Symposium on High Performance Computer Architecture, HPCA 2015, Burlingame, CA, USA, February 2015
@inproceedings{abc, author = {Gennady Pekhimenko and Tyler Huberty and Rui Cai and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {21st IEEE International Symposium on High Performance Computer Architecture, HPCA 2015, Burlingame, CA, USA}, title = {Exploiting compressed block size as an indicator of future reuse.}, url = {http://dx.doi.org/10.1109/HPCA.2015.7056021}, year = {2015} }
Computer Architecture Letters, January 2015
@inproceedings{abc, author = {Gennady Pekhimenko and Evgeny Bolotin and Mike O{\textquoteright}Connor and Onur Mutlu and Todd C. Mowry and Stephen W. Keckler}, booktitle = {Computer Architecture Letters}, title = {Toggle-Aware Compression for GPUs.}, url = {http://dx.doi.org/10.1109/LCA.2015.2430853}, year = {2015} }
Computer Architecture Letters, January 2015
@inproceedings{abc, author = {Vivek Seshadri and Kevin Hsieh and Amirali Boroumand and Donghyuk Lee and Michael A. Kozuch and Onur Mutlu and Phillip B. Gibbons and Todd C. Mowry}, booktitle = {Computer Architecture Letters}, title = {Fast Bulk Bitwise AND and OR in DRAM.}, url = {http://dx.doi.org/10.1109/LCA.2015.2434872}, year = {2015} }
2014
International Conference on Parallel Architectures and Compilation, PACT '14, Edmonton, AB, Canada, August 2014
@inproceedings{abc, author = {Bradley Thwaites and Gennady Pekhimenko and Hadi Esmaeilzadeh and Amir Yazdanbakhsh and Onur Mutlu and Jongse Park and Girish Mururu and Todd C. Mowry}, booktitle = {International Conference on Parallel Architectures and Compilation, PACT {\textquoteright}14, Edmonton, AB, Canada}, title = {Rollback-free value prediction with approximate loads.}, url = {http://doi.acm.org/10.1145/2628071.2628110}, year = {2014} }
ACM/IEEE 41st International Symposium on Computer Architecture, ISCA 2014, Minneapolis, MN, USA, June 2014
@inproceedings{abc, author = {Vivek Seshadri and Abhishek Bhowmick and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {ACM/IEEE 41st International Symposium on Computer Architecture, ISCA 2014, Minneapolis, MN, USA}, title = {The Dirty-Block Index.}, url = {http://dx.doi.org/10.1109/ISCA.2014.6853204}, year = {2014} }
TACO, January 2014
@inproceedings{abc, author = {Vivek Seshadri and Samihan Yedkar and Hongyi Xin and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {TACO}, title = {Mitigating Prefetcher-Caused Pollution Using Informed Caching Policies for Prefetched Blocks.}, url = {http://doi.acm.org/10.1145/2677956}, year = {2014} }
2013
The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA, December 2013
@inproceedings{abc, author = {Vivek Seshadri and Yoongu Kim and Chris Fallin and Donghyuk Lee and Rachata Ausavarungnirun and Gennady Pekhimenko and Yixin Luo and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA}, title = {RowClone: fast and energy-efficient in-DRAM bulk data copy and initialization.}, url = {http://doi.acm.org/10.1145/2540708.2540725}, year = {2013} }
The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA, December 2013
@inproceedings{abc, author = {Gennady Pekhimenko and Vivek Seshadri and Yoongu Kim and Hongyi Xin and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA}, title = {Linearly compressed pages: a low-complexity, low-latency main memory compression framework.}, url = {http://doi.acm.org/10.1145/2540708.2540724}, year = {2013} }
2012
International Conference on Parallel Architectures and Compilation Techniques, PACT '12, Minneapolis, MN, September 2012
@inproceedings{abc, author = {Gennady Pekhimenko and Todd C. Mowry and Onur Mutlu}, booktitle = {International Conference on Parallel Architectures and Compilation Techniques, PACT {\textquoteright}12, Minneapolis, MN}, title = {Linearly compressed pages: a main memory compression framework with low complexity and low latency.}, url = {http://doi.acm.org/10.1145/2370816.2370911}, year = {2012} }
International Conference on Parallel Architectures and Compilation Techniques, PACT '12, Minneapolis, MN, September 2012
@inproceedings{abc, author = {Vivek Seshadri and Onur Mutlu and Michael A. Kozuch and Todd C. Mowry}, booktitle = {International Conference on Parallel Architectures and Compilation Techniques, PACT {\textquoteright}12, Minneapolis, MN}, title = {The evicted-address filter: a unified mechanism to address both cache pollution and thrashing.}, url = {http://doi.acm.org/10.1145/2370816.2370868}, year = {2012} }
International Conference on Parallel Architectures and Compilation Techniques, PACT '12, Minneapolis, MN, September 2012
@inproceedings{abc, author = {Gennady Pekhimenko and Vivek Seshadri and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry}, booktitle = {International Conference on Parallel Architectures and Compilation Techniques, PACT {\textquoteright}12, Minneapolis, MN}, title = {Base-delta-immediate compression: practical data compression for on-chip caches.}, url = {http://doi.acm.org/10.1145/2370816.2370870}, year = {2012} }