Publications by Gennady Pekhimenko
2017
Proceedings of the 2017 ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems, Urbana-Champaign, IL, USA, June 2017
@inproceedings{abc,
  author    = {Donghyuk Lee and Samira Manabi Khan and Lavanya Subramanian and Saugata Ghose and Rachata Ausavarungnirun and Gennady Pekhimenko and Vivek Seshadri and Onur Mutlu},
  title     = {Design-Induced Latency Variation in Modern {DRAM} Chips: Characterization, Analysis, and Latency Reduction Mechanisms},
  booktitle = {Proceedings of the 2017 ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems, Urbana-Champaign, IL, USA},
  year      = {2017},
  doi       = {10.1145/3078505.3078533},
  url       = {http://doi.acm.org/10.1145/3078505.3078533},
}
2017 IEEE International Symposium on High Performance Computer Architecture, HPCA 2017, Austin, TX, USA, February 2017
@inproceedings{abc,
  author    = {Hasan Hassan and Nandita Vijaykumar and Samira Manabi Khan and Saugata Ghose and Kevin K. Chang and Gennady Pekhimenko and Donghyuk Lee and Oguz Ergin and Onur Mutlu},
  title     = {{SoftMC}: A Flexible and Practical Open-Source Infrastructure for Enabling Experimental {DRAM} Studies},
  booktitle = {2017 IEEE International Symposium on High Performance Computer Architecture, HPCA 2017, Austin, TX, USA},
  year      = {2017},
  doi       = {10.1109/HPCA.2017.62},
  url       = {https://doi.org/10.1109/HPCA.2017.62},
}
POMACS, January 2017
@article{abc,
  author  = {Donghyuk Lee and Samira Manabi Khan and Lavanya Subramanian and Saugata Ghose and Rachata Ausavarungnirun and Gennady Pekhimenko and Vivek Seshadri and Onur Mutlu},
  title   = {Design-Induced Latency Variation in Modern {DRAM} Chips: Characterization, Analysis, and Latency Reduction Mechanisms},
  journal = {Proceedings of the ACM on Measurement and Analysis of Computing Systems},
  year    = {2017},
  doi     = {10.1145/3084464},
  url     = {http://doi.acm.org/10.1145/3084464},
}
2016
49th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO 2016, Taipei, Taiwan, October 2016
@inproceedings{abc,
  author    = {Nandita Vijaykumar and Kevin Hsieh and Gennady Pekhimenko and Samira Manabi Khan and Ashish Shrestha and Saugata Ghose and Adwait Jog and Phillip B. Gibbons and Onur Mutlu},
  title     = {{Zorua}: A holistic approach to resource virtualization in {GPUs}},
  booktitle = {49th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO 2016, Taipei, Taiwan},
  year      = {2016},
  doi       = {10.1109/MICRO.2016.7783718},
  url       = {http://dx.doi.org/10.1109/MICRO.2016.7783718},
}
Proceedings of the 2016 ACM SIGMETRICS International Conference on Measurement and Modeling of Computer Science, Antibes Juan-Les-Pins, France, June 2016
@inproceedings{abc,
  author    = {Kevin K. Chang and Abhijith Kashyap and Hasan Hassan and Saugata Ghose and Kevin Hsieh and Donghyuk Lee and Tianshi Li and Gennady Pekhimenko and Samira Manabi Khan and Onur Mutlu},
  title     = {Understanding Latency Variation in Modern {DRAM} Chips: Experimental Characterization, Analysis, and Optimization},
  booktitle = {Proceedings of the 2016 ACM SIGMETRICS International Conference on Measurement and Modeling of Computer Science, Antibes Juan-Les-Pins, France},
  year      = {2016},
  doi       = {10.1145/2896377.2901453},
  url       = {http://doi.acm.org/10.1145/2896377.2901453},
}
2016 IEEE International Symposium on High Performance Computer Architecture, HPCA 2016, Barcelona, Spain, March 2016
@inproceedings{abc,
  author    = {Gennady Pekhimenko and Evgeny Bolotin and Nandita Vijaykumar and Onur Mutlu and Todd C. Mowry and Stephen W. Keckler},
  title     = {A case for toggle-aware compression for {GPU} systems},
  booktitle = {2016 IEEE International Symposium on High Performance Computer Architecture, HPCA 2016, Barcelona, Spain},
  year      = {2016},
  doi       = {10.1109/HPCA.2016.7446064},
  url       = {http://dx.doi.org/10.1109/HPCA.2016.7446064},
}
2016 IEEE International Symposium on High Performance Computer Architecture, HPCA 2016, Barcelona, Spain, March 2016
@inproceedings{abc,
  author    = {Hasan Hassan and Gennady Pekhimenko and Nandita Vijaykumar and Vivek Seshadri and Donghyuk Lee and Oguz Ergin and Onur Mutlu},
  title     = {{ChargeCache}: Reducing {DRAM} latency by exploiting row access locality},
  booktitle = {2016 IEEE International Symposium on High Performance Computer Architecture, HPCA 2016, Barcelona, Spain},
  year      = {2016},
  doi       = {10.1109/HPCA.2016.7446096},
  url       = {http://dx.doi.org/10.1109/HPCA.2016.7446096},
}
IEEE Design & Test, January 2016
@article{abc,
  author  = {Amir Yazdanbakhsh and Bradley Thwaites and Hadi Esmaeilzadeh and Gennady Pekhimenko and Onur Mutlu and Todd C. Mowry},
  title   = {Mitigating the Memory Bottleneck With Approximate Load Value Prediction},
  journal = {IEEE Design \& Test},
  year    = {2016},
  doi     = {10.1109/MDAT.2015.2504899},
  url     = {http://dx.doi.org/10.1109/MDAT.2015.2504899},
}
CoRR, January 2016
Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, different bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the cores. For example, when a GPU is bottlenecked by the available off-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive.
This work describes the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate "assist warps" that execute on GPU cores to perform specific tasks that can improve GPU performance and efficiency.
CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps.
We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.
@article{abc,
  author        = {Nandita Vijaykumar and Gennady Pekhimenko and Adwait Jog and Saugata Ghose and Abhishek Bhowmick and Rachata Ausavarungnirun and Chita R. Das and Mahmut T. Kandemir and Todd C. Mowry and Onur Mutlu},
  title         = {A Framework for Accelerating Bottlenecks in {GPU} Execution with Assist Warps},
  journal       = {CoRR},
  year          = {2016},
  eprint        = {1602.01348},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1602.01348},
  abstract      = {Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, different bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the cores. For example, when a GPU is bottlenecked by the available off-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive. This work describes the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate ``assist warps'' that execute on GPU cores to perform specific tasks that can improve GPU performance and efficiency. CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps. We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7\% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.},
}
CoRR, January 2016
@article{abc,
  author        = {Donghyuk Lee and Yoongu Kim and Gennady Pekhimenko and Samira Manabi Khan and Vivek Seshadri and Kevin Kai-Wei Chang and Onur Mutlu},
  title         = {Adaptive-Latency {DRAM} ({AL-DRAM})},
  journal       = {CoRR},
  year          = {2016},
  eprint        = {1603.08454},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1603.08454},
}
Bioinformatics, January 2016
@article{abc,
  author  = {Hongyi Xin and Sunny Nahar and Richard Zhu and John Emmons and Gennady Pekhimenko and Carl Kingsford and Can Alkan and Onur Mutlu},
  title   = {Optimal seed solver: optimizing seed selection in read mapping},
  journal = {Bioinformatics},
  year    = {2016},
  doi     = {10.1093/bioinformatics/btv670},
  url     = {http://dx.doi.org/10.1093/bioinformatics/btv670},
}
CoRR, January 2016
@article{abc,
  author        = {Donghyuk Lee and Samira Manabi Khan and Lavanya Subramanian and Rachata Ausavarungnirun and Gennady Pekhimenko and Vivek Seshadri and Saugata Ghose and Onur Mutlu},
  title         = {Reducing {DRAM} Latency by Exploiting Design-Induced Latency Variation in Modern {DRAM} Chips},
  journal       = {CoRR},
  year          = {2016},
  eprint        = {1610.09604},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1610.09604},
}
TACO, January 2016
@article{abc,
  author  = {Amir Yazdanbakhsh and Gennady Pekhimenko and Bradley Thwaites and Hadi Esmaeilzadeh and Onur Mutlu and Todd C. Mowry},
  title   = {{RFVP}: Rollback-Free Value Prediction with Safe-to-Approximate Loads},
  journal = {ACM Transactions on Architecture and Code Optimization},
  year    = {2016},
  doi     = {10.1145/2836168},
  url     = {http://doi.acm.org/10.1145/2836168},
}
TACO, January 2016
@article{abc,
  author  = {Donghyuk Lee and Saugata Ghose and Gennady Pekhimenko and Samira Manabi Khan and Onur Mutlu},
  title   = {Simultaneous Multi-Layer Access: Improving {3D-Stacked} Memory Bandwidth at Low Cost},
  journal = {ACM Transactions on Architecture and Code Optimization},
  year    = {2016},
  doi     = {10.1145/2832911},
  url     = {http://doi.acm.org/10.1145/2832911},
}
2015
Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA, June 2015
Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, different bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the
cores. For example, when a GPU is bottlenecked by the available off-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive. This paper introduces the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate “assist warps” that execute on GPU cores to perform specific tasks that can improve GPU performance and
efficiency. CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps. We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.
@inproceedings{abc,
  author    = {Nandita Vijaykumar and Gennady Pekhimenko and Adwait Jog and Abhishek Bhowmick and Rachata Ausavarungnirun and Chita R. Das and Mahmut T. Kandemir and Todd C. Mowry and Onur Mutlu},
  title     = {A case for core-assisted bottleneck acceleration in {GPUs}: enabling flexible data compression with assist warps},
  booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA},
  year      = {2015},
  doi       = {10.1145/2749469.2750399},
  url       = {http://doi.acm.org/10.1145/2749469.2750399},
  abstract  = {Modern Graphics Processing Units (GPUs) are well provisioned to support the concurrent execution of thousands of threads. Unfortunately, different bottlenecks during execution and heterogeneous application requirements create imbalances in utilization of resources in the cores. For example, when a GPU is bottlenecked by the available off-chip memory bandwidth, its computational resources are often overwhelmingly idle, waiting for data from memory to arrive. This paper introduces the Core-Assisted Bottleneck Acceleration (CABA) framework that employs idle on-chip resources to alleviate different bottlenecks in GPU execution. CABA provides flexible mechanisms to automatically generate {\textquotedblleft}assist warps{\textquotedblright} that execute on GPU cores to perform specific tasks that can improve GPU performance and efficiency. CABA enables the use of idle computational units and pipelines to alleviate the memory bandwidth bottleneck, e.g., by using assist warps to perform data compression to transfer less data from memory. Conversely, the same framework can be employed to handle cases where the GPU is bottlenecked by the available computational units, in which case the memory pipelines are idle and can be used by CABA to speed up computation, e.g., by performing memoization using assist warps. We provide a comprehensive design and evaluation of CABA to perform effective and flexible data compression in the GPU memory hierarchy to alleviate the memory bandwidth bottleneck. Our extensive evaluations show that CABA, when used to implement data compression, provides an average performance improvement of 41.7\% (as high as 2.6X) across a variety of memory-bandwidth-sensitive GPGPU applications.},
}
Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA, June 2015
@inproceedings{abc,
  title     = {Page overlays: an enhanced virtual memory framework to enable fine-grained memory management.},
  author    = {Vivek Seshadri and Gennady Pekhimenko and Olatunji Ruwase and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry and Trishul M. Chilimbi},
  booktitle = {Proceedings of the 42nd Annual International Symposium on Computer Architecture, Portland, OR, USA},
  year      = {2015},
  url       = {http://doi.acm.org/10.1145/2749469.2750379},
}
21st IEEE International Symposium on High Performance Computer Architecture, HPCA 2015, Burlingame, CA, USA, February 2015
@inproceedings{abc,
  title     = {Exploiting compressed block size as an indicator of future reuse.},
  author    = {Gennady Pekhimenko and Tyler Huberty and Rui Cai and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry},
  booktitle = {21st IEEE International Symposium on High Performance Computer Architecture, HPCA 2015, Burlingame, CA, USA},
  year      = {2015},
  url       = {http://dx.doi.org/10.1109/HPCA.2015.7056021},
}
21st IEEE International Symposium on High Performance Computer Architecture, HPCA 2015, Burlingame, CA, USA, February 2015
@inproceedings{abc,
  author    = {Donghyuk Lee and Yoongu Kim and Gennady Pekhimenko and Samira Manabi Khan and Vivek Seshadri and Kevin Kai-Wei Chang and Onur Mutlu},
  title     = {Adaptive-latency {DRAM}: Optimizing {DRAM} timing for the common-case},
  booktitle = {21st IEEE International Symposium on High Performance Computer Architecture, HPCA 2015, Burlingame, CA, USA},
  year      = {2015},
  doi       = {10.1109/HPCA.2015.7056057},
  url       = {http://dx.doi.org/10.1109/HPCA.2015.7056057},
}
Bioinformatics, January 2015
@article{abc,
  author  = {Hongyi Xin and John Greth and John Emmons and Gennady Pekhimenko and Carl Kingsford and Can Alkan and Onur Mutlu},
  title   = {Shifted {Hamming} distance: a fast and accurate {SIMD-friendly} filter to accelerate alignment verification in read mapping},
  journal = {Bioinformatics},
  year    = {2015},
  doi     = {10.1093/bioinformatics/btu856},
  url     = {http://dx.doi.org/10.1093/bioinformatics/btu856},
}
CoRR, January 2015
@article{abc,
  title   = {Optimal Seed Solver: Optimizing Seed Selection in Read Mapping.},
  author  = {Hongyi Xin and Richard Zhu and Sunny Nahar and John Emmons and Gennady Pekhimenko and Carl Kingsford and Can Alkan and Onur Mutlu},
  journal = {CoRR},
  year    = {2015},
  url     = {http://arxiv.org/abs/1506.08235},
}
CoRR, January 2015
@article{abc,
  author        = {Donghyuk Lee and Gennady Pekhimenko and Samira Manabi Khan and Saugata Ghose and Onur Mutlu},
  title         = {Simultaneous Multi Layer Access: A High Bandwidth and Low Cost {3D-Stacked} Memory Interface},
  journal       = {CoRR},
  year          = {2015},
  eprint        = {1506.03160},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1506.03160},
}
Computer Architecture Letters, January 2015
@article{abc,
  author  = {Gennady Pekhimenko and Evgeny Bolotin and Mike O'Connor and Onur Mutlu and Todd C. Mowry and Stephen W. Keckler},
  title   = {Toggle-Aware Compression for {GPUs}},
  journal = {IEEE Computer Architecture Letters},
  year    = {2015},
  doi     = {10.1109/LCA.2015.2430853},
  url     = {http://dx.doi.org/10.1109/LCA.2015.2430853},
}
2014
International Conference on Parallel Architectures and Compilation, PACT '14, Edmonton, AB, Canada, August 2014
@inproceedings{abc,
  title     = {Rollback-free value prediction with approximate loads.},
  author    = {Bradley Thwaites and Gennady Pekhimenko and Hadi Esmaeilzadeh and Amir Yazdanbakhsh and Onur Mutlu and Jongse Park and Girish Mururu and Todd C. Mowry},
  booktitle = {International Conference on Parallel Architectures and Compilation, PACT {\textquoteright}14, Edmonton, AB, Canada},
  year      = {2014},
  url       = {http://doi.acm.org/10.1145/2628071.2628110},
}
2013
The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA, December 2013
@inproceedings{abc,
  author    = {Vivek Seshadri and Yoongu Kim and Chris Fallin and Donghyuk Lee and Rachata Ausavarungnirun and Gennady Pekhimenko and Yixin Luo and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry},
  title     = {{RowClone}: fast and energy-efficient {in-DRAM} bulk data copy and initialization},
  booktitle = {The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA},
  year      = {2013},
  doi       = {10.1145/2540708.2540725},
  url       = {http://doi.acm.org/10.1145/2540708.2540725},
}
The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA, December 2013
@inproceedings{abc,
  title     = {Linearly compressed pages: a low-complexity, low-latency main memory compression framework.},
  author    = {Gennady Pekhimenko and Vivek Seshadri and Yoongu Kim and Hongyi Xin and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry},
  booktitle = {The 46th Annual IEEE/ACM International Symposium on Microarchitecture, MICRO-46, Davis, CA, USA},
  year      = {2013},
  url       = {http://doi.acm.org/10.1145/2540708.2540724},
}
2012
International Conference on Parallel Architectures and Compilation Techniques, PACT '12, Minneapolis, MN, September 2012
@inproceedings{abc,
  title     = {Linearly compressed pages: a main memory compression framework with low complexity and low latency.},
  author    = {Gennady Pekhimenko and Todd C. Mowry and Onur Mutlu},
  booktitle = {International Conference on Parallel Architectures and Compilation Techniques, PACT {\textquoteright}12, Minneapolis, MN},
  year      = {2012},
  url       = {http://doi.acm.org/10.1145/2370816.2370911},
}
International Conference on Parallel Architectures and Compilation Techniques, PACT '12, Minneapolis, MN, September 2012
@inproceedings{abc,
  title     = {Base-delta-immediate compression: practical data compression for on-chip caches.},
  author    = {Gennady Pekhimenko and Vivek Seshadri and Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch and Todd C. Mowry},
  booktitle = {International Conference on Parallel Architectures and Compilation Techniques, PACT {\textquoteright}12, Minneapolis, MN},
  year      = {2012},
  url       = {http://doi.acm.org/10.1145/2370816.2370870},
}