Publications by Chris Wilkerson
2017
Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture, Cambridge, MA, USA, October 2017
DRAM cells in close proximity can fail depending on the data content in neighboring cells. These failures are called data-dependent failures. Detecting and mitigating these failures online, while the system is running in the field, enables various optimizations that improve reliability, latency, and energy efficiency of the system. For example, a system can improve performance and energy efficiency by using a lower refresh rate for most cells and mitigate the failing cells using higher refresh rates or error correcting codes. All these system optimizations depend on accurately detecting every possible data-dependent failure that could occur with any content in DRAM. Unfortunately, detecting all data-dependent failures requires the knowledge of DRAM internals specific to each DRAM chip. As internal DRAM architecture is not exposed to the system, detecting data-dependent failures at the system-level is a major challenge.
In this paper, we decouple the detection and mitigation of data-dependent failures from physical DRAM organization such that it is possible to detect failures without knowledge of DRAM internals. To this end, we propose MEMCON, a memory content-based detection and mitigation mechanism for data-dependent failures in DRAM. MEMCON does not detect every possible data-dependent failure. Instead, it detects and mitigates failures that occur only with the current content in memory while the programs are running in the system. Such a mechanism needs to detect failures whenever there is a write access that changes the content of memory. As detection of failure with a runtime testing has a high overhead, MEMCON selectively initiates a test on a write, only when the time between two consecutive writes to that page (i.e., write interval) is long enough to provide significant benefit by lowering the refresh rate during that interval. MEMCON builds upon a simple, practical mechanism that predicts the long write intervals based on our observation that the write intervals in real workloads follow a Pareto distribution: the longer a page remains idle after a write, the longer it is expected to remain idle.
Our evaluation shows that compared to a system that uses an aggressive refresh rate, MEMCON reduces refresh operations by 65–74%, leading to a 10%/17%/40% (min) to 12%/22%/50% (max) performance improvement for a single-core and 10%/23%/52% (min) to 17%/29%/65% (max) performance improvement for a 4-core system using 8/16/32 Gb DRAM chips.
@inproceedings{abc,
  author    = {Khan, Samira Manabi and Wilkerson, Chris and Wang, Zhe and Alameldeen, Alaa R. and Lee, Donghyuk and Mutlu, Onur},
  title     = {Detecting and mitigating data-dependent {DRAM} failures by exploiting current memory content},
  booktitle = {Proceedings of the 50th Annual IEEE/ACM International Symposium on Microarchitecture},
  venue     = {Cambridge, MA, USA},
  month     = oct,
  year      = {2017},
  abstract  = {DRAM cells in close proximity can fail depending on the data content in neighboring cells. These failures are called data-dependent failures. Detecting and mitigating these failures online, while the system is running in the field, enables various optimizations that improve reliability, latency, and energy efficiency of the system. For example, a system can improve performance and energy efficiency by using a lower refresh rate for most cells and mitigate the failing cells using higher refresh rates or error correcting codes. All these system optimizations depend on accurately detecting every possible data-dependent failure that could occur with any content in DRAM. Unfortunately, detecting all data-dependent failures requires the knowledge of DRAM internals specific to each DRAM chip. As internal DRAM architecture is not exposed to the system, detecting data-dependent failures at the system-level is a major challenge. In this paper, we decouple the detection and mitigation of data-dependent failures from physical DRAM organization such that it is possible to detect failures without knowledge of DRAM internals. To this end, we propose MEMCON, a memory content-based detection and mitigation mechanism for data-dependent failures in DRAM. MEMCON does not detect every possible data-dependent failure. Instead, it detects and mitigates failures that occur only with the current content in memory while the programs are running in the system. Such a mechanism needs to detect failures whenever there is a write access that changes the content of memory. As detection of failure with a runtime testing has a high overhead, MEMCON selectively initiates a test on a write, only when the time between two consecutive writes to that page (i.e., write interval) is long enough to provide significant benefit by lowering the refresh rate during that interval. MEMCON builds upon a simple, practical mechanism that predicts the long write intervals based on our observation that the write intervals in real workloads follow a Pareto distribution: the longer a page remains idle after a write, the longer it is expected to remain idle. Our evaluation shows that compared to a system that uses an aggressive refresh rate, MEMCON reduces refresh operations by 65--74\%, leading to a 10\%/17\%/40\% (min) to 12\%/22\%/50\% (max) performance improvement for a single-core and 10\%/23\%/52\% (min) to 17\%/29\%/65\% (max) performance improvement for a 4-core system using 8/16/32 Gb DRAM chips.},
}
2016
CoRR, January 2016
@article{chang2016refresh,
  author        = {Chang, Kevin Kai-Wei and Lee, Donghyuk and Chishti, Zeshan and Alameldeen, Alaa R. and Wilkerson, Chris and Kim, Yoongu and Mutlu, Onur},
  title         = {Reducing Performance Impact of {DRAM} Refresh by Parallelizing Refreshes with Accesses},
  journal       = {CoRR},
  year          = {2016},
  eprint        = {1601.06352},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1601.06352},
}
CoRR, January 2016
@article{kim2016rowhammer,
  author        = {Kim, Yoongu and Daly, Ross and Kim, Jeremie and Fallin, Chris and Lee, Ji-Hye and Lee, Donghyuk and Wilkerson, Chris and Lai, Konrad and Mutlu, Onur},
  title         = {{RowHammer}: Reliability Analysis and Security Implications},
  journal       = {CoRR},
  year          = {2016},
  eprint        = {1603.00747},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1603.00747},
}
2014
32nd IEEE International Conference on Computer Design, ICCD 2014, Seoul, South Korea, October 2014
@inproceedings{fallin2014hba,
  author    = {Fallin, Chris and Wilkerson, Chris and Mutlu, Onur},
  title     = {The heterogeneous block architecture},
  booktitle = {32nd IEEE International Conference on Computer Design, ICCD 2014, Seoul, South Korea},
  month     = oct,
  year      = {2014},
  doi       = {10.1109/ICCD.2014.6974710},
}
ACM/IEEE 41st International Symposium on Computer Architecture, ISCA 2014, Minneapolis, MN, USA, June 2014
@inproceedings{kim2014flipping,
  author    = {Kim, Yoongu and Daly, Ross and Kim, Jeremie and Fallin, Chris and Lee, Ji-Hye and Lee, Donghyuk and Wilkerson, Chris and Lai, Konrad and Mutlu, Onur},
  title     = {Flipping bits in memory without accessing them: An experimental study of {DRAM} disturbance errors},
  booktitle = {ACM/IEEE 41st International Symposium on Computer Architecture, ISCA 2014, Minneapolis, MN, USA},
  month     = jun,
  year      = {2014},
  doi       = {10.1109/ISCA.2014.6853210},
}
ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems, SIGMETRICS '14, Austin, TX, June 2014
@inproceedings{khan2014efficacy,
  author    = {Khan, Samira Manabi and Lee, Donghyuk and Kim, Yoongu and Alameldeen, Alaa R. and Wilkerson, Chris and Mutlu, Onur},
  title     = {The efficacy of error mitigation techniques for {DRAM} retention failures: a comparative experimental study},
  booktitle = {ACM SIGMETRICS / International Conference on Measurement and Modeling of Computer Systems, SIGMETRICS {\textquoteright}14, Austin, TX},
  month     = jun,
  year      = {2014},
  doi       = {10.1145/2591971.2592000},
}
20th IEEE International Symposium on High Performance Computer Architecture, HPCA 2014, Orlando, FL, USA, February 2014
@inproceedings{khan2014readwrite,
  author    = {Khan, Samira Manabi and Alameldeen, Alaa R. and Wilkerson, Chris and Mutlu, Onur and Jim{\'e}nez, Daniel A.},
  title     = {Improving cache performance using read-write partitioning},
  booktitle = {20th IEEE International Symposium on High Performance Computer Architecture, HPCA 2014, Orlando, FL, USA},
  month     = feb,
  year      = {2014},
  doi       = {10.1109/HPCA.2014.6835954},
}
20th IEEE International Symposium on High Performance Computer Architecture, HPCA 2014, Orlando, FL, USA, February 2014
@inproceedings{chang2014refresh,
  author    = {Chang, Kevin Kai-Wei and Lee, Donghyuk and Chishti, Zeshan and Alameldeen, Alaa R. and Wilkerson, Chris and Kim, Yoongu and Mutlu, Onur},
  title     = {Improving {DRAM} performance by parallelizing refreshes with accesses},
  booktitle = {20th IEEE International Symposium on High Performance Computer Architecture, HPCA 2014, Orlando, FL, USA},
  month     = feb,
  year      = {2014},
  doi       = {10.1109/HPCA.2014.6835946},
}
2013
The 40th Annual International Symposium on Computer Architecture, ISCA'13, Tel-Aviv, Israel, June 2013
@inproceedings{liu2013retention,
  author    = {Liu, Jamie and Jaiyen, Ben and Kim, Yoongu and Wilkerson, Chris and Mutlu, Onur},
  title     = {An experimental study of data retention behavior in modern {DRAM} devices: implications for retention time profiling mechanisms},
  booktitle = {The 40th Annual International Symposium on Computer Architecture, ISCA{\textquoteright}13, Tel-Aviv, Israel},
  month     = jun,
  year      = {2013},
  doi       = {10.1145/2485922.2485928},
}
2003
HPCA, January 2003
@inproceedings{mutlu2003runahead,
  author    = {Mutlu, Onur and Stark, Jared and Wilkerson, Chris and Patt, Yale N.},
  title     = {Runahead Execution: An Alternative to Very Large Instruction Windows for Out-of-Order Processors},
  booktitle = {IEEE International Symposium on High Performance Computer Architecture, HPCA 2003},
  year      = {2003},
  url       = {http://computer.org/proceedings/hpca/1871/18710129abs.htm},
}
IEEE Micro, January 2003
@article{mutlu2003micro,
  author  = {Mutlu, Onur and Stark, Jared and Wilkerson, Chris and Patt, Yale N.},
  title   = {Runahead Execution: An Effective Alternative to Large Instruction Windows},
  journal = {IEEE Micro},
  year    = {2003},
  url     = {http://csdl.computer.org/comp/mags/mi/2003/06/m6020abs.htm},
}