Publications by Torsten Hoefler
2018
Proceedings of the 23rd International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS), Williamsburg, VA, USA, March 2018
Emerging chips with hundreds and thousands of cores require networks with unprecedented energy/area efficiency and scalability. To address this, we propose Slim NoC (SN): a new on-chip network design that delivers significant improvements in efficiency and scalability compared to the state-of-the-art. The key idea is to use two concepts from graph and number theory, degree-diameter graphs combined with non-prime finite fields, to enable the smallest number of ports for a given core count. SN is inspired by state-of-the-art off-chip topologies; it identifies and distills their advantages for NoC settings while solving several key issues that lead to significant overheads on-chip. SN provides NoC-specific layouts, which further enhance area/energy efficiency. We show how to augment SN with state-of-the-art router microarchitecture schemes such as Elastic Links, to make the network even more scalable and efficient. Our extensive experimental evaluations show that SN outperforms both traditional low-radix topologies (e.g., meshes and tori) and modern high-radix networks (e.g., various Flattened Butterflies) in area, latency, throughput, and static/dynamic power consumption for both synthetic and real workloads. SN provides a promising direction in scalable and energy-efficient NoC topologies.
@inproceedings{abc, abstract = {Emerging chips with hundreds and thousands of cores require networks with unprecedented energy/area efficiency and scalability. To address this, we propose Slim NoC (SN): a new on-chip network design that delivers significant improvements in efficiency and scalability compared to the state-of-the-art. The key idea is to use two concepts from graph and number theory, degree-diameter graphs combined with non-prime finite fields, to enable the smallest number of ports for a given core count. SN is inspired by state-of-the-art off-chip topologies; it identifies and distills their advantages for NoC settings while solving several key issues that lead to significant overheads on-chip. SN provides NoC-specific layouts, which further enhance area/energy efficiency. We show how to augment SN with state-of-the-art router microarchitecture schemes such as Elastic Links, to make the network even more scalable and efficient. Our extensive experimental evaluations show that SN outperforms both traditional low-radix topologies (e.g., meshes and tori) and modern high-radix networks (e.g., various Flattened Butterflies) in area, latency, throughput, and static/dynamic power consumption for both synthetic and real workloads. SN provides a promising direction in scalable and energy-efficient NoC topologies.}, author = {Maciej Besta and Syed M. Hassan and Sudhakar Yalamanchili and Rachata Ausavarungnirun and Onur Mutlu and Torsten Hoefler}, booktitle = {Proceedings of the 23rd International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)}, title = {Slim NoC: A Low-Diameter On-Chip Network Topology for High Energy Efficiency and Scalability}, venue = {Williamsburg, VA, USA}, year = {2018} }
2017
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Denver, CO, USA, November 2017
Betweenness centrality (BC) is a crucial graph problem that measures the significance of a vertex by the number of shortest paths leading through it. We propose Maximal Frontier Betweenness Centrality (MFBC): a succinct BC algorithm based on novel sparse matrix multiplication routines that performs a factor of p1/3 less communication on p processors than the best known alternatives, for graphs with n vertices and average degree k = n/p2/3. We formulate, implement, and prove the correctness of MFBC for weighted graphs by leveraging monoids instead of semirings, which enables a surprisingly succinct formulation. MFBC scales well for both extremely sparse and relatively dense graphs. It automatically searches a space of distributed data decompositions and sparse matrix multiplication algorithms for the most advantageous configuration. The MFBC implementation outperforms the well-known CombBLAS library by up to 8x and shows more robust performance. Our design methodology is readily extensible to other graph problems.
@inproceedings{abc, abstract = {Betweenness centrality (BC) is a crucial graph problem that measures the significance of a vertex by the number of shortest paths leading through it. We propose Maximal Frontier Betweenness Centrality (MFBC): a succinct BC algorithm based on novel sparse matrix multiplication routines that performs a factor of p1/3 less communication on p processors than the best known alternatives, for graphs with n vertices and average degree k = n/p2/3. We formulate, implement, and prove the correctness of MFBC for weighted graphs by leveraging monoids instead of semirings, which enables a surprisingly succinct formulation. MFBC scales well for both extremely sparse and relatively dense graphs. It automatically searches a space of distributed data decompositions and sparse matrix multiplication algorithms for the most advantageous configuration. The MFBC implementation outperforms the well-known CombBLAS library by up to 8x and shows more robust performance. Our design methodology is readily extensible to other graph problems.}, author = {Edgar Solomonik and Maciej Besta and Flavio Vella and Torsten Hoefler}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, title = {Scaling Betweenness Centrality using Communication-Efficient Sparse Matrix Multiplication}, venue = {Denver, CO, USA}, year = {2017} }
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Denver, CO, USA, November 2017
Optimizing communication performance is imperative for large-scale computing because communication overheads limit the strong scalability of parallel applications. Today's network cards contain rather powerful processors optimized for data movement. However, these devices are limited to fixed functions, such as remote direct memory access. We develop sPIN, a portable programming model to offload simple packet processing functions to the network card. To demonstrate the potential of the model, we design a cycle-accurate simulation environment by combining the network simulator LogGOPSim and the CPU simulator gem5. We implement offloaded message matching, datatype processing, and collective communications and demonstrate transparent full-application speedups. Furthermore, we show how sPIN can be used to accelerate redundant in-memory filesystems and several other use cases. Our work investigates a portable packet-processing network acceleration model similar to compute acceleration with CUDA or OpenCL. We show how such network acceleration enables an eco-system that can significantly speed up applications and system services.
@inproceedings{abc, abstract = {Optimizing communication performance is imperative for large-scale computing because communication overheads limit the strong scalability of parallel applications. Today{\textquoteright}s network cards contain rather powerful processors optimized for data movement. However, these devices are limited to fixed functions, such as remote direct memory access. We develop sPIN, a portable programming model to offload simple packet processing functions to the network card. To demonstrate the potential of the model, we design a cycle-accurate simulation environment by combining the network simulator LogGOPSim and the CPU simulator gem5. We implement offloaded message matching, datatype processing, and collective communications and demonstrate transparent full-application speedups. Furthermore, we show how sPIN can be used to accelerate redundant in-memory filesystems and several other use cases. Our work investigates a portable packet-processing network acceleration model similar to compute acceleration with CUDA or OpenCL. We show how such network acceleration enables an eco-system that can significantly speed up applications and system services.}, author = {Torsten Hoefler and Salvatore Di Girolamo and Konstantin Taranov and Ron Brightwell}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, title = {sPIN: High-performance streaming Processing in the Network}, venue = {Denver, CO, USA}, year = {2017} }
IEEE Transactions on Parallel and Distributed Systems, October 2017
The cost of data movement has always been an important concern in high performance computing (HPC) systems. It has now become the dominant factor in terms of both energy consumption and performance. Support for expression of data locality has been explored in the past, but those efforts have had only modest success in being adopted in HPC applications for various reasons. them However, with the increasing complexity of the memory hierarchy and higher parallelism in emerging HPC systems, locality management has acquired a new urgency. Developers can no longer limit themselves to low-level solutions and ignore the potential for productivity and performance portability obtained by using locality abstractions. Fortunately, the trend emerging in recent literature on the topic alleviates many of the concerns that got in the way of their adoption by application developers. Data locality abstractions are available in the forms of libraries, data structures, languages and runtime systems; a common theme is increasing productivity without sacrificing performance. This paper examines these trends and identifies commonalities that can combine various locality concepts to develop a comprehensive approach to expressing and managing data locality on future large-scale high-performance computing systems.
@article{abc, abstract = {The cost of data movement has always been an important concern in high performance computing (HPC) systems. It has now become the dominant factor in terms of both energy consumption and performance. Support for expression of data locality has been explored in the past, but those efforts have had only modest success in being adopted in HPC applications for various reasons. them However, with the increasing complexity of the memory hierarchy and higher parallelism in emerging HPC systems, locality management has acquired a new urgency. Developers can no longer limit themselves to low-level solutions and ignore the potential for productivity and performance portability obtained by using locality abstractions. Fortunately, the trend emerging in recent literature on the topic alleviates many of the concerns that got in the way of their adoption by application developers. Data locality abstractions are available in the forms of libraries, data structures, languages and runtime systems; a common theme is increasing productivity without sacrificing performance. This paper examines these trends and identifies commonalities that can combine various locality concepts to develop a comprehensive approach to expressing and managing data locality on future large-scale high-performance computing systems.}, author = {Didem Unat and Anshu Dubey and Torsten Hoefler and John Shalf and Mark Abraham and Mauro Bianco and Bradford L. Chamberlain and Romain Cledat and H. Carter Edwards and Hal Finkel and Karl Fuerlinger and Frank Hannig and Emmanuel Jeannot and Amir Kamil and Jeff Keasler and Paul H. J. Kelly and Vitus Leung and Hatem Ltaief and Naoya Maruyama and Chris J. Newburn and Miquel Pericas}, pages = {3007-3020}, journal = {IEEE Transactions on Parallel and Distributed Systems}, title = {Trends in Data Locality Abstractions for HPC Systems}, volume = {28}, year = {2017} }
Proceedings of the 25th Annual Symposium on High-Performance Interconnects (HOTI'17), Santa Clara, CA, USA, August 2017
Interconnection networks must meet the communication demands of current High-Performance Computing systems. In order to interconnect efficiently the end nodes of these systems with a good performance-to-cost ratio, new network topologies have been proposed in the last years that leverage high-radix switches, such as Slim Fly. Adversarial traffic patterns, however, may reduce severely the performance of Slim Fly networks when using only minimal-path routing. In order to mitigate the performance degradation in these scenarios, Slim Fly networks should configure an oblivious or adaptive non-minimal routing. The non-minimal routing algorithms proposed for Slim Fly usually rely on Valiant's algorithm to select the paths, at the cost of doubling the average path-length, as well as the number of Virtual Channels (VCs) required to prevent deadlocks. Moreover, Valiant may introduce additional inefficiencies when applied to Slim Fly networks, such as the "turn-around problem" that we analyze in this work. With the aim of overcoming these drawbacks, we propose in this paper two variants of the Valiant's algorithm that improve the non-minimal path selection in Slim Fly networks. They are designed to be combined with adaptive routing algorithms that rely on Valiant to select non-minimalpaths, such as UGAL or PAR, which we have adapted to the Slim Fly topology. Through the results from simulation experiments, we show that our proposals improve the network performance and/or reduce the number of required VCs to prevent deadlocks, even in scenarios with adversarial traffic.
@inproceedings{abc, abstract = {Interconnection networks must meet the communication demands of current High-Performance Computing systems. In order to interconnect efficiently the end nodes of these systems with a good performance-to-cost ratio, new network topologies have been proposed in the last years that leverage high-radix switches, such as Slim Fly. Adversarial traffic patterns, however, may reduce severely the performance of Slim Fly networks when using only minimal-path routing. In order to mitigate the performance degradation in these scenarios, Slim Fly networks should configure an oblivious or adaptive non-minimal routing. The non-minimal routing algorithms proposed for Slim Fly usually rely on Valiant{\textquoteright}s algorithm to select the paths, at the cost of doubling the average path-length, as well as the number of Virtual Channels (VCs) required to prevent deadlocks. Moreover, Valiant may introduce additional inefficiencies when applied to Slim Fly networks, such as the "turn-around problem" that we analyze in this work. With the aim of overcoming these drawbacks, we propose in this paper two variants of the Valiant{\textquoteright}s algorithm that improve the non-minimal path selection in Slim Fly networks. They are designed to be combined with adaptive routing algorithms that rely on Valiant to select non-minimalpaths, such as UGAL or PAR, which we have adapted to the Slim Fly topology. Through the results from simulation experiments, we show that our proposals improve the network performance and/or reduce the number of required VCs to prevent deadlocks, even in scenarios with adversarial traffic.}, author = {Pedro Yebenes and Jesus Escudero-Sahuquillo and Pedro Javier Garcia and Francisco J. Quiles and Torsten Hoefler}, booktitle = {Proceedings of the 25th Annual Symposium on High-Performance Interconnects (HOTI{\textquoteright}17)}, title = {Improving Non-Minimal and Adaptive Routing Algorithms in Slim Fly Networks}, venue = {Santa Clara, CA, USA}, year = {2017} }
Proceedings of the 25th Annual Symposium on High-Performance Interconnects (HOTI'17), Santa Clara, CA, USA, August 2017
The advent of non-volatile memory (NVM) technologies has added an interesting nuance to the node level memory hierarchy. With modern 100 Gb/s networks, the NVM tier of storage can often be slower than the high performance network in the system; thus, a new challenge arises in the datacenter. Whereas prior efforts have studied the impacts of multiple sources targeting one node (i.e., incast) and have studied multiple flows causing congestion in inter-switch links, it is now possible for a single flow from a single source to overwhelm the bandwidth of a key portion of the memory hierarchy. This can subsequently spread to the switches and lead to congestion trees in a flow-controlled network or excessive packet drops without flow control. In this work we describe protocols which avoid overwhelming the receiver in the case of a source/sink rate mismatch. We design our protocols on top of Portals 4, which enables us to make use of network offload. Our protocol yields up to 4× higher throughput in a 5k node Dragonfly topology for a permutation traffic pattern in which only 1% of all nodes have a memory write-bandwidth limitation of 1/8th of the network bandwidth.
@inproceedings{abc, abstract = {The advent of non-volatile memory (NVM) technologies has added an interesting nuance to the node level memory hierarchy. With modern 100 Gb/s networks, the NVM tier of storage can often be slower than the high performance network in the system; thus, a new challenge arises in the datacenter. Whereas prior efforts have studied the impacts of multiple sources targeting one node (i.e., incast) and have studied multiple flows causing congestion in inter-switch links, it is now possible for a single flow from a single source to overwhelm the bandwidth of a key portion of the memory hierarchy. This can subsequently spread to the switches and lead to congestion trees in a flow-controlled network or excessive packet drops without flow control. In this work we describe protocols which avoid overwhelming the receiver in the case of a source/sink rate mismatch. We design our protocols on top of Portals 4, which enables us to make use of network offload. Our protocol yields up to 4{\texttimes} higher throughput in a 5k node Dragonfly topology for a permutation traffic pattern in which only 1\% of all nodes have a memory write-bandwidth limitation of 1/8th of the network bandwidth.}, author = {Timo Schneider and James Dinan and Mario Flajslik and Keith D. Underwood and Torsten Hoefler}, booktitle = {Proceedings of the 25th Annual Symposium on High-Performance Interconnects (HOTI{\textquoteright}17)}, title = {Fast Networks and Slow Memories: A Mechanism for Mitigating Bandwidth Mismatches}, venue = {Santa Clara, CA, USA}, year = {2017} }
Proceedings of the 26th International Symposium on High-Performance Parallel and Distributed Computing, Washington, DC, USA, June 2017
Many distributed systems require coordination between the components involved. With the steady growth of such systems, the probability of failures increases, which necessitates scalable fault-tolerant agreement protocols. The most common practical agreement protocol, for such scenarios, is leader-based atomic broadcast. In this work, we propose AllConcur, a distributed system that provides agreement through a leaderless concurrent atomic broadcast algorithm, thus, not suffering from the bottleneck of a central coordinator. In AllConcur, all components exchange messages concurrently through a logical overlay network that employs early termination to minimize the agreement latency. Our implementation of AllConcur supports standard sockets-based TCP as well as high-performance InfiniBand Verbs communications. AllConcur can handle up to 135 million requests per second and achieves 17x higher throughput than today's standard leader-based protocols, such as Libpaxos. Thus, AllConcur is highly competitive with regard to existing solutions and, due to its decentralized approach, enables hitherto unattainable system designs in a variety of fields.
@inproceedings{abc, abstract = {Many distributed systems require coordination between the components involved. With the steady growth of such systems, the probability of failures increases, which necessitates scalable fault-tolerant agreement protocols. The most common practical agreement protocol, for such scenarios, is leader-based atomic broadcast. In this work, we propose AllConcur, a distributed system that provides agreement through a leaderless concurrent atomic broadcast algorithm, thus, not suffering from the bottleneck of a central coordinator. In AllConcur, all components exchange messages concurrently through a logical overlay network that employs early termination to minimize the agreement latency. Our implementation of AllConcur supports standard sockets-based TCP as well as high-performance InfiniBand Verbs communications. AllConcur can handle up to 135 million requests per second and achieves 17x higher throughput than today{\textquoteright}s standard leader-based protocols, such as Libpaxos. Thus, AllConcur is highly competitive with regard to existing solutions and, due to its decentralized approach, enables hitherto unattainable system designs in a variety of fields.}, author = {Marius Poke and Torsten Hoefler and Colin W. Glass}, booktitle = {Proceedings of the 26th International Symposium on High-Performance Parallel and Distributed Computing}, title = {AllConcur: Leaderless Concurrent Atomic Broadcast}, venue = {Washington, DC, USA}, year = {2017} }
Proceedings of the 26th International Symposium on High-Performance Parallel and Distributed Computing, Washington, DC, USA, June 2017
We reduce the cost of communication and synchronization in graph processing by analyzing the fastest way to process graphs: pushing the updates to a shared state or pulling the updates to a private state. We investigate the applicability of this push-pull dichotomy to various algorithms and its impact on complexity, performance, and the amount of used locks, atomics, and reads/writes. We consider 11 graph algorithms, 3 programming models, 2 graph abstractions, and various families of graphs. The conducted analysis illustrates surprising differences between push and pull variants of different algorithms in performance, speed of convergence, and code complexity; the insights are backed up by performance data from hardware counters. We use these findings to illustrate which variant is faster for each algorithm and to develop generic strategies that enable even higher speedups. Our insights can be used to accelerate graph processing engines or libraries on both massively-parallel shared-memory machines as well as distributed-memory systems.
@inproceedings{abc, abstract = {We reduce the cost of communication and synchronization in graph processing by analyzing the fastest way to process graphs: pushing the updates to a shared state or pulling the updates to a private state. We investigate the applicability of this push-pull dichotomy to various algorithms and its impact on complexity, performance, and the amount of used locks, atomics, and reads/writes. We consider 11 graph algorithms, 3 programming models, 2 graph abstractions, and various families of graphs. The conducted analysis illustrates surprising differences between push and pull variants of different algorithms in performance, speed of convergence, and code complexity; the insights are backed up by performance data from hardware counters. We use these findings to illustrate which variant is faster for each algorithm and to develop generic strategies that enable even higher speedups. Our insights can be used to accelerate graph processing engines or libraries on both massively-parallel shared-memory machines as well as distributed-memory systems.}, author = {Maciej Besta and Michal Podstawski and Linus Groner and Edgar Solomonik and Torsten Hoefler}, booktitle = {Proceedings of the 26th International Symposium on High-Performance Parallel and Distributed Computing}, title = {To Push or To Pull: On Reducing Communication and Synchronization in Graph Computations}, venue = {Washington, DC, USA}, year = {2017} }
Proceedings of the International Conference on Computational Science (ICCS'17), Zurich, Switzerland, June 2017
Designing a partial differential equations solver is a complex task which involves making choices about the solution algorithm and its parameters. Such choices are usually done on the basis of personal preference or numerical experiments, which can introduce significant bias on the selection process. In this work we develop a methodology to drive this selection process towards the optimal choices by modelling the accuracy and the performance of the solution algorithm. We show how this methodology can be successfully applied on the linear advection problem. As a result, the selection can be optimally performed with a much lower investment on the development of high-performance versions of the solvers and without using the target architecture for numerical experiments.
@inproceedings{abc, abstract = {Designing a partial differential equations solver is a complex task which involves making choices about the solution algorithm and its parameters. Such choices are usually done on the basis of personal preference or numerical experiments, which can introduce significant bias on the selection process. In this work we develop a methodology to drive this selection process towards the optimal choices by modelling the accuracy and the performance of the solution algorithm. We show how this methodology can be successfully applied on the linear advection problem. As a result, the selection can be optimally performed with a much lower investment on the development of high-performance versions of the solvers and without using the target architecture for numerical experiments.}, author = {Andrea Arteaga and Oliver Fuhrer and Torsten Hoefler and Thomas Schulthess}, booktitle = {Proceedings of the International Conference on Computational Science (ICCS{\textquoteright}17)}, title = {Model-Driven Choice of Numerical Methods for the Solution of the Linear Advection Equation}, venue = {Zurich, Switzerland}, year = {2017} }
Proceedings of the 29th ACM Symposium on Parallelism in Algorithms and Architectures, Washington, DC, USA, June 2017
Many large-scale scientific computations require eigenvalue solvers in a scaling regime where efficiency is limited by data movement. We introduce a parallel algorithm for computing the eigenvalues of a dense symmetric matrix, which performs asymptotically less communication than previously known approaches. We provide analysis in the Bulk Synchronous Parallel (BSP) model with additional consideration for communication between a local memory and cache. Given sufficient memory to store c copies of the symmetric matrix, our algorithm requires \Theta(\sqrt{c}) less interprocessor communication than previously known algorithms, for any c\leq p^{1/3} when using p processors. The algorithm first reduces the dense symmetric matrix to a banded matrix with the same eigenvalues. Subsequently, the algorithm employs successive reduction to O(\log p) thinner banded matrices. We employ two new parallel algorithms that achieve lower communication costs for the full-to-band and band-to-band reductions. Both of these algorithms leverage a novel QR factorization algorithm for rectangular matrices.
@inproceedings{abc, abstract = {Many large-scale scientific computations require eigenvalue solvers in a scaling regime where efficiency is limited by data movement. We introduce a parallel algorithm for computing the eigenvalues of a dense symmetric matrix, which performs asymptotically less communication than previously known approaches. We provide analysis in the Bulk Synchronous Parallel (BSP) model with additional consideration for communication between a local memory and cache. Given sufficient memory to store c copies of the symmetric matrix, our algorithm requires \Theta(\sqrt{c}) less interprocessor communication than previously known algorithms, for any c\leq p^{1/3} when using p processors. The algorithm first reduces the dense symmetric matrix to a banded matrix with the same eigenvalues. Subsequently, the algorithm employs successive reduction to O(\log p) thinner banded matrices. We employ two new parallel algorithms that achieve lower communication costs for the full-to-band and band-to-band reductions. Both of these algorithms leverage a novel QR factorization algorithm for rectangular matrices.}, author = {Edgar Solomonik and Grey Ballard and James Demmel and Torsten Hoefler}, booktitle = {Proceedings of the 29th ACM Symposium on Parallelism in Algorithms and Architectures}, title = {A Communication-Avoiding Parallel Algorithm for the Symmetric Eigenvalue Problem}, venue = {Washington, DC, USA}, year = {2017} }
Proceedings of the Algorithms and Complexity - 10th International Conference, CIAC 2017, Athens, Greece, May 2017
We investigate the multi-agent pathfinding (MAPF) problem with n agents on graphs with n vertices: Each agent has a unique start and goal vertex, with the objective of moving all agents in parallel movements to their goal s.t. each vertex and each edge may only be used by one agent at a time. We give a combinatorial classification of all graphs where this problem is solvable in general, including cases where the solvability depends on the initial agent placement.
Furthermore, we present an algorithm solving the MAPF problem in our setting, requiring O(n2)O(n2) rounds, or O(n3)O(n3) moves of individual agents. Complementing these results, we show that there are graphs where Ω(n2)Ω(n2) rounds and Ω(n3)Ω(n3) moves are required for any algorithm.
@inproceedings{abc, abstract = {We investigate the multi-agent pathfinding (MAPF) problem with n agents on graphs with n vertices: Each agent has a unique start and goal vertex, with the objective of moving all agents in parallel movements to their goal s.t. each vertex and each edge may only be used by one agent at a time. We give a combinatorial classification of all graphs where this problem is solvable in general, including cases where the solvability depends on the initial agent placement. Furthermore, we present an algorithm solving the MAPF problem in our setting, requiring O(n2)O(n2) rounds, or O(n3)O(n3) moves of individual agents. Complementing these results, we show that there are graphs where {\textohm}(n2){\textohm}(n2) rounds and {\textohm}(n3){\textohm}(n3) moves are required for any algorithm.}, author = {Klaus-Tycho Foerster and Linus Groner and Torsten Hoefler and Michael Koenig and Sascha Schmid and Roger Wattenhofer}, booktitle = {Proceedings of the Algorithms and Complexity - 10th International Conference, CIAC 2017}, title = {Multi-agent Pathfinding with n Agents on Graphs with n Vertices: Combinatorial Classification and Tight Algorithmic Bounds}, venue = {Athens, Greece}, year = {2017} }
Proceedings of the 31st IEEE International Parallel & Distributed Processing Symposium (IPDPS'17), Orlando, FL, USA, May 2017
The constantly increasing gap between communication and computation performance emphasizes the importance of communication-avoidance techniques. Caching is a well-known concept used to reduce accesses to slow local memories. In this work, we extend the caching idea to MPI-3 Remote Memory Access (RMA) operations. Here, caching can avoid inter-node communications and achieve similar benefits for irregular applications as communication-avoiding algorithms for structured applications. We propose CLaMPI, a caching library layered on top of MPI-3 RMA, to automatically optimize code with minimum user intervention. We demonstrate how cached RMA improves the performance of a Barnes Hut simulation and a Local Clustering Coefficient computation up to a factor of 1.8x and 5x, respectively. Due to the low overheads in the cache miss case and the potential benefits, we expect that our ideas around transparent RMA caching will soon be an integral part of many MPI libraries.
@inproceedings{abc, abstract = {The constantly increasing gap between communication and computation performance emphasizes the importance of communication-avoidance techniques. Caching is a well-known concept used to reduce accesses to slow local memories. In this work, we extend the caching idea to MPI-3 Remote Memory Access (RMA) operations. Here, caching can avoid inter-node communications and achieve similar benefits for irregular applications as communication-avoiding algorithms for structured applications. We propose CLaMPI, a caching library layered on top of MPI-3 RMA, to automatically optimize code with minimum user intervention. We demonstrate how cached RMA improves the performance of a Barnes Hut simulation and a Local Clustering Coefficient computation up to a factor of 1.8x and 5x, respectively. Due to the low overheads in the cache miss case and the potential benefits, we expect that our ideas around transparent RMA caching will soon be an integral part of many MPI libraries.}, author = {Salvatore Di Girolamo and Flavio Vella and Torsten Hoefler}, booktitle = {Proceedings of the 31st IEEE International Parallel \& Distributed Processing Symposium (IPDPS{\textquoteright}17)}, title = {Transparent Caching for RMA Systems}, venue = {Orlando, FL, USA}, year = {2017} }
Proceedings of the 31st IEEE International Parallel & Distributed Processing Symposium (IPDPS'17), Orlando, FL, USA, May 2017
Large-scale parallel programming environments and algorithms require efficient group-communication on computing systems with failing nodes. Existing reliable broadcast algorithms either cannot guarantee that all nodes are reached or are very expensive in terms of the number of messages and latency. This paper proposes Corrected-Gossip, a method that combines Monte Carlo style gossiping with a deterministic correction phase, to construct a Las Vegas style reliable broadcast that guarantees reaching all the nodes at low cost. We analyze the performance of this method both analytically and by simulations and show how it reduces the latency and network load compared to existing algorithms. Our method improves the latency by 20% and the network load by 53% compared to the fastest known algorithm on 4,096 nodes. We believe that the principle of corrected-gossip opens an avenue for many other reliable group communication operations.
@inproceedings{abc, abstract = {Large-scale parallel programming environments and algorithms require efficient group-communication on computing systems with failing nodes. Existing reliable broadcast algorithms either cannot guarantee that all nodes are reached or are very expensive in terms of the number of messages and latency. This paper proposes Corrected-Gossip, a method that combines Monte Carlo style gossiping with a deterministic correction phase, to construct a Las Vegas style reliable broadcast that guarantees reaching all the nodes at low cost. We analyze the performance of this method both analytically and by simulations and show how it reduces the latency and network load compared to existing algorithms. Our method improves the latency by 20\% and the network load by 53\% compared to the fastest known algorithm on 4,096 nodes. We believe that the principle of corrected-gossip opens an avenue for many other reliable group communication operations.}, author = {Torsten Hoefler and Amnon Barak and Amnon Shiloh and }, booktitle = {Proceedings of the 31st IEEE International Parallel \& Distributed Processing Symposium (IPDPS{\textquoteright}17)}, title = {Corrected Gossip Algorithms for Fast Reliable Broadcast on Unreliable Systems}, venue = {Orlando, FL, USA}, year = {2017} }
Proceedings of the 31st IEEE International Parallel & Distributed Processing Symposium (IPDPS'17), Orlando, FL, USA, May 2017
We present a new parallel algorithm for solving triangular systems with multiple right hand sides (TRSM). TRSM is used extensively in numerical linear algebra computations, both to solve triangular linear systems of equations as well as to compute factorizations with triangular matrices, such as Cholesky, LU, and QR. Our algorithm achieves better theoretical scalability than known alternatives, while maintaining numerical stability, via selective use of triangular matrix inversion. We leverage the fact that triangular inversion and matrix multiplication are more parallelizable than the standard TRSM algorithm. By only inverting triangular blocks along the diagonal of the initial matrix, we generalize the usual way of TRSM computation and the full matrix inversion approach. This flexibility leads to an efficient algorithm for any ratio of the number of right hand sides to the triangular matrix dimension. We provide a detailed communication cost analysis for our algorithm as well as for the recursive triangular matrix inversion. This cost analysis makes it possible to determine optimal block sizes and processor grids a priori. Relative to the best known algorithms for TRSM, our approach can require asymptotically fewer messages, while performing optimal amounts of computation and communication in terms of words sent.
@inproceedings{abc, abstract = {We present a new parallel algorithm for solving triangular systems with multiple right hand sides (TRSM). TRSM is used extensively in numerical linear algebra computations, both to solve triangular linear systems of equations as well as to compute factorizations with triangular matrices, such as Cholesky, LU, and QR. Our algorithm achieves better theoretical scalability than known alternatives, while maintaining numerical stability, via selective use of triangular matrix inversion. We leverage the fact that triangular inversion and matrix multiplication are more parallelizable than the standard TRSM algorithm. By only inverting triangular blocks along the diagonal of the initial matrix, we generalize the usual way of TRSM computation and the full matrix inversion approach. This flexibility leads to an efficient algorithm for any ratio of the number of right hand sides to the triangular matrix dimension. We provide a detailed communication cost analysis for our algorithm as well as for the recursive triangular matrix inversion. This cost analysis makes it possible to determine optimal block sizes and processor grids a priori. Relative to the best known algorithms for TRSM, our approach can require asymptotically fewer messages, while performing optimal amounts of computation and communication in terms of words sent.}, author = {Tobias Wicky and Edgar Solomonik and Torsten Hoefler}, booktitle = {Proceedings of the 31st IEEE International Parallel \& Distributed Processing Symposium (IPDPS{\textquoteright}17)}, title = {Communication-Avoiding Parallel Algorithms for Solving Triangular Systems of Linear Equations}, venue = {Orlando, FL, USA}, year = {2017} }
Proceedings of the 31st IEEE International Parallel & Distributed Processing Symposium (IPDPS'17), Orlando, FL, USA, May 2017
Increasingly complex memory systems and onchip interconnects are developed to mitigate the data movement bottlenecks in manycore processors. One example of such a complex system is the Xeon Phi KNL CPU with three different types of memory, fifteen memory configuration options, and a complex on-chip mesh network connecting up to 72 cores. Users require a detailed understanding of the performance characteristics of the different options to utilize the system efficiently. Unfortunately, peak performance is rarely achievable and achievable performance is hardly documented. We address this with capability models of the memory subsystem, derived by systematic measurements, to guide users to navigate the complex optimization space. As a case study, we provide an extensive model of all memory configuration options for Xeon Phi KNL. We demonstrate how our capability model can be used to automatically derive new close-to-optimal algorithms for various communication functions yielding improvements 5x and 24x over Intel's tuned OpenMP and MPI implementations, respectively. Furthermore, we demonstrate how to use the models to assess how efficiently a bitonic sort application utilizes the memory resources. Interestingly, our capability models predict and explain that the high bandwidthMCDRAM does not improve the bitonic sort performance over DRAM.
@inproceedings{abc, abstract = {Increasingly complex memory systems and onchip interconnects are developed to mitigate the data movement bottlenecks in manycore processors. One example of such a complex system is the Xeon Phi KNL CPU with three different types of memory, fifteen memory configuration options, and a complex on-chip mesh network connecting up to 72 cores. Users require a detailed understanding of the performance characteristics of the different options to utilize the system efficiently. Unfortunately, peak performance is rarely achievable and achievable performance is hardly documented. We address this with capability models of the memory subsystem, derived by systematic measurements, to guide users to navigate the complex optimization space. As a case study, we provide an extensive model of all memory configuration options for Xeon Phi KNL. We demonstrate how our capability model can be used to automatically derive new close-to-optimal algorithms for various communication functions yielding improvements 5x and 24x over Intel{\textquoteright}s tuned OpenMP and MPI implementations, respectively. Furthermore, we demonstrate how to use the models to assess how efficiently a bitonic sort application utilizes the memory resources. Interestingly, our capability models predict and explain that the high bandwidthMCDRAM does not improve the bitonic sort performance over DRAM.}, author = {Sabela Ramos and Torsten Hoefler}, booktitle = {Proceedings of the 31st IEEE International Parallel \& Distributed Processing Symposium (IPDPS{\textquoteright}17)}, title = {Capability Models for Manycore Memory Systems: A Case-Study with Xeon Phi KNL}, venue = {Orlando, FL, USA}, year = {2017} }
Proceedings of the 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming, Austin, TX, USA, February 2017
Task-based programming offers an elegant way to express units of computation and the dependencies among them, making it easier to distribute the computational load evenly across multiple cores. However, this separation of problem decomposition and parallelism requires a sufficiently large input problem to achieve satisfactory efficiency on a given number of cores. Unfortunately, finding a good match between input size and core count usually requires significant experimentation, which is expensive and sometimes even impractical. In this paper, we propose an automated empirical method for finding the isoefficiency function of a task-based program, binding efficiency, core count, and the input size in one analytical expression. This allows the latter two to be adjusted according to given (realistic) efficiency objectives. Moreover, we not only find (i) the actual isoefficiency function but also (ii) the function one would yield if the program execution was free of resource contention and (iii) an upper bound that could only be reached if the program was able to maintain its average parallelism throughout its execution. The difference between the three helps to explain low efficiency, and in particular, it helps to differentiate between resource contention and structural conflicts related to task dependencies or scheduling. The insights gained can be used to co-design programs and shared system resources.
@inproceedings{abc, abstract = {Task-based programming offers an elegant way to express units of computation and the dependencies among them, making it easier to distribute the computational load evenly across multiple cores. However, this separation of problem decomposition and parallelism requires a sufficiently large input problem to achieve satisfactory efficiency on a given number of cores. Unfortunately, finding a good match between input size and core count usually requires significant experimentation, which is expensive and sometimes even impractical. In this paper, we propose an automated empirical method for finding the isoefficiency function of a task-based program, binding efficiency, core count, and the input size in one analytical expression. This allows the latter two to be adjusted according to given (realistic) efficiency objectives. Moreover, we not only find (i) the actual isoefficiency function but also (ii) the function one would yield if the program execution was free of resource contention and (iii) an upper bound that could only be reached if the program was able to maintain its average parallelism throughout its execution. The difference between the three helps to explain low efficiency, and in particular, it helps to differentiate between resource contention and structural conflicts related to task dependencies or scheduling. The insights gained can be used to co-design programs and shared system resources.}, author = {Sergei Shudler and Alexandru Calotoiu and Torsten Hoefler and Felix Wolf}, booktitle = {Proceedings of the 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming}, title = {Isoefficiency in Practice: Configuring and Understanding the Performance of Task-based Applications}, venue = {Austin, TX, USA}, year = {2017} }
PVLDB, January 2017
@inproceedings{abc, author = {Claude Barthels and Gustavo Alonso and Torsten Hoefler and Timo Schneider and Ingo M{\"u}ller}, booktitle = {PVLDB}, title = {Distributed Join Algorithms on Thousands of Cores.}, url = {http://www.vldb.org/pvldb/vol10/p517-barthels.pdf}, year = {2017} }
IEEE Data Eng. Bull., January 2017
High-throughput, low-latency networks are becoming a key element in database appliances and data processing systems to reduce the overhead of data movement. In this article, we focus on Remote Direct Memory Access (RDMA), a feature increasingly available in modern networks enabling the network card to directly write to and read from main memory. RDMA has started to attract attention as a technical solution to quite a few performance bottlenecks in distributed data management but there is still much work to be done to make it an effective technology suitable for database engines. In this article, we identify several advantages and drawbacks of RDMA and related technologies, and propose new communication primitives that would bridge the gap between the operations provided by high-speed networks and the needs of data processing systems.
@article{abc, abstract = {High-throughput, low-latency networks are becoming a key element in database appliances and data processing systems to reduce the overhead of data movement. In this article, we focus on Remote Direct Memory Access (RDMA), a feature increasingly available in modern networks enabling the network card to directly write to and read from main memory. RDMA has started to attract attention as a technical solution to quite a few performance bottlenecks in distributed data management but there is still much work to be done to make it an effective technology suitable for database engines. In this article, we identify several advantages and drawbacks of RDMA and related technologies, and propose new communication primitives that would bridge the gap between the operations provided by high-speed networks and the needs of data processing systems.}, author = {Claude Barthels and Gustavo Alonso and Torsten Hoefler}, journal = {IEEE Data Eng. Bull.}, title = {Designing Databases for Future High-Performance Networks.}, url = {http://sites.computer.org/debull/A17mar/p15.pdf}, year = {2017} }
2016
Proceedings of the IEEE International Conference on Cluster Computing (CLUSTER) , Taipei, Taiwan, December 2016
Tuning large applications requires a clever exploration of the design and configuration space. Especially on supercomputers, this space is so large that its exhaustive traversal via performance experiments becomes too expensive, if not impossible. Manually creating analytical performance models provides insights into optimization opportunities but is extremely laborious if done for applications of realistic size. If we must consider multiple performance-relevant parameters and their possible interactions, a common requirement, this task becomes even more complex. We build on previous work on automatic scalability modeling and significantly extend it to allow insightful modeling of any combination of application execution parameters. Multi-parameter modeling has so far been outside the reach of automatic methods due to the exponential growth of the model search space. We develop a new technique to traverse the search space rapidly and generate insightful performance models that enable a wide range of uses from performance predictions for balanced machine design to performance tuning.
@inproceedings{abc, abstract = {Tuning large applications requires a clever exploration of the design and configuration space. Especially on supercomputers, this space is so large that its exhaustive traversal via performance experiments becomes too expensive, if not impossible. Manually creating analytical performance models provides insights into optimization opportunities but is extremely laborious if done for applications of realistic size. If we must consider multiple performance-relevant parameters and their possible interactions, a common requirement, this task becomes even more complex. We build on previous work on automatic scalability modeling and significantly extend it to allow insightful modeling of any combination of application execution parameters. Multi-parameter modeling has so far been outside the reach of automatic methods due to the exponential growth of the model search space. We develop a new technique to traverse the search space rapidly and generate insightful performance models that enable a wide range of uses from performance predictions for balanced machine design to performance tuning.}, author = {Alexandru Calotoiu and David Beckinsale and Christopher W. Earl and Torsten Hoefler and Ian Karlin and Martin Schulz and Felix Wolf}, booktitle = {Proceedings of the IEEE International Conference on Cluster Computing (CLUSTER) }, title = {Fast Multi-Parameter Performance Modeling}, venue = {Taipei, Taiwan}, year = {2016} }
Proceedings of the IEEE 24th Annual Symposium on High-Performance Interconnects (HOTI), Santa Clara, CA, USA, December 2016
Lossless networks, such as InfiniBand use flow-control to avoid packet-loss due to congestion. This introduces dependencies between input and output channels, in case of cyclic dependencies the network can deadlock. Deadlocks can be resolved by splitting a physical channel into multiple virtual channels with independent buffers and credit systems. Currently available routing engines for InfiniBand assign entire paths from source to destination nodes to different virtual channels. However, InfiniBand allows changing the virtual channel at every switch. We developed fast routing engines which make use of that fact and map individual hops to virtual channels. Our algorithm imposes a total order on virtual channels and increments the virtual channel at every hop, thus the diameter of the network is an upper bound for the required number of virtual channels. We integrated this algorithm into the InfiniBand software stack. Our algorithms provide deadlock free routing on state-of-the-art low-diameter topologies, using fewer virtual channels than currently available practical approaches, while being faster by a factor of four on large networks. Since low-diameter topologies are common among the largest supercomputers in the world, to provide deadlock-free routing for such systems is very important.
@inproceedings{abc, abstract = {Lossless networks, such as InfiniBand use flow-control to avoid packet-loss due to congestion. This introduces dependencies between input and output channels, in case of cyclic dependencies the network can deadlock. Deadlocks can be resolved by splitting a physical channel into multiple virtual channels with independent buffers and credit systems. Currently available routing engines for InfiniBand assign entire paths from source to destination nodes to different virtual channels. However, InfiniBand allows changing the virtual channel at every switch. We developed fast routing engines which make use of that fact and map individual hops to virtual channels. Our algorithm imposes a total order on virtual channels and increments the virtual channel at every hop, thus the diameter of the network is an upper bound for the required number of virtual channels. We integrated this algorithm into the InfiniBand software stack. Our algorithms provide deadlock free routing on state-of-the-art low-diameter topologies, using fewer virtual channels than currently available practical approaches, while being faster by a factor of four on large networks. Since low-diameter topologies are common among the largest supercomputers in the world, to provide deadlock-free routing for such systems is very important.}, author = {Timo Schneider and Otto Bibartiu and Torsten Hoefler}, booktitle = {Proceedings of the IEEE 24th Annual Symposium on High-Performance Interconnects (HOTI)}, title = {Ensuring Deadlock-Freedom in Low-Diameter InfiniBand Networks}, venue = {Santa Clara, CA, USA}, year = {2016} }
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Salt Lake City, UT, USA, November 2016
MeteoSwiss, the Swiss national weather forecast institute, has selected densely populated accelerator servers as their primary system to compute weather forecast simulation. Servers with multiple accelerator devices that are primarily connected by a PCI-Express (PCIe) network achieve a significantly higher energy efficiency. Memory transfers between accelerators in such a system are subjected to PCIe arbitration policies. In this paper, we study the impact of PCIe topology and develop a congestion-aware performance model for PCIe communication. We present an algorithm for computing congestion factors of every communication in a congestion graph that characterizes the dynamic usage of network resources by an application. Our model applies to any PCIe tree topology. Our validation results on two different topologies of 8 GPU devices demonstrate that our model achieves an accuracy of over 97% within the PCIe network. We demonstrate the model on a weather forecast application to identify the best algorithms for its communication patterns among GPUs.
@inproceedings{abc, abstract = {MeteoSwiss, the Swiss national weather forecast institute, has selected densely populated accelerator servers as their primary system to compute weather forecast simulation. Servers with multiple accelerator devices that are primarily connected by a PCI-Express (PCIe) network achieve a significantly higher energy efficiency. Memory transfers between accelerators in such a system are subjected to PCIe arbitration policies. In this paper, we study the impact of PCIe topology and develop a congestion-aware performance model for PCIe communication. We present an algorithm for computing congestion factors of every communication in a congestion graph that characterizes the dynamic usage of network resources by an application. Our model applies to any PCIe tree topology. Our validation results on two different topologies of 8 GPU devices demonstrate that our model achieves an accuracy of over 97\% within the PCIe network. We demonstrate the model on a weather forecast application to identify the best algorithms for its communication patterns among GPUs.}, author = {Maxime Martinasso and Grzegorz Kwasniewski and Sadaf R. Alam}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, title = {A PCIe congestion-aware performance model for densely populated accelerator servers}, venue = {Salt Lake City, UT, USA}, year = {2016} }
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Salt Lake City, UT, USA, November 2016
The goal of the extreme scale plasma turbulence studies described in this paper is to expedite the delivery of reliable predictions on confinement physics in large magnetic fusion systems by using world-class supercomputers to carry out simulations with unprecedented resolution and temporal duration. This has involved architecture-dependent optimizations of performance scaling and addressing code portability and energy issues, with the metrics for multi-platform comparisons being "time-to-solution" and "energy-to-solution". Realistic results addressing how confinement losses caused by plasma turbulence scale from present-day devices to the much larger $25 billion international ITER fusion facility have been enabled by innovative advances in the GTC-P code including (i) implementation of one-sided communication from MPI 3.0 standard; (ii) creative optimization techniques on Xeon Phi processors; and (iii) development of a novel performance model for the key kernels of the PIC code. Results show that modeling data movement is sufficient to predict performance on modern supercomputer platforms.
@inproceedings{abc, abstract = {The goal of the extreme scale plasma turbulence studies described in this paper is to expedite the delivery of reliable predictions on confinement physics in large magnetic fusion systems by using world-class supercomputers to carry out simulations with unprecedented resolution and temporal duration. This has involved architecture-dependent optimizations of performance scaling and addressing code portability and energy issues, with the metrics for multi-platform comparisons being "time-to-solution" and "energy-to-solution". Realistic results addressing how confinement losses caused by plasma turbulence scale from present-day devices to the much larger $25 billion international ITER fusion facility have been enabled by innovative advances in the GTC-P code including (i) implementation of one-sided communication from MPI 3.0 standard; (ii) creative optimization techniques on Xeon Phi processors; and (iii) development of a novel performance model for the key kernels of the PIC code. Results show that modeling data movement is sufficient to predict performance on modern supercomputer platforms.}, author = {William Tang and Stephane Ethier and Grzegorz Kwasniewski and Torsten Hoefler and Khaled Z. Ibrahim and Kamesh Madduri and Samuel Williams and Leonid Oliker and Carlos Rosales-Fernandez and Tim Williams}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, title = {Extreme Scale Plasma Turbulence Simulations on Top Supercomputers Worldwide}, venue = {Salt Lake City, UT, USA}, year = {2016} }
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Salt Lake City, UT, USA, November 2016
The interconnection network has a large influence on total cost, application performance, energy consumption, and overall system efficiency of a supercomputer. Unfortunately, today's routing algorithms do not utilize this important resource most efficiently. We first demonstrate this by defining the dark fiber metric as a measure of unused resource in networks. To improve the utilization, we propose scheduling-aware routing, a new technique that uses the current state of the batch system to determine a new set of network routes and so increases overall system utilization by up to 17.74%. We also show that our proposed routing increases the throughput of communication benchmarks by up to 17.6% on a practical InfiniBand installation. Our routing method is implemented in the standard InfiniBand tool set and can immediately be used to optimize systems. In fact, we are using it to improve the utilization of our production petascale supercomputer for more than one year.
@inproceedings{abc, abstract = {The interconnection network has a large influence on total cost, application performance, energy consumption, and overall system efficiency of a supercomputer. Unfortunately, today{\textquoteright}s routing algorithms do not utilize this important resource most efficiently. We first demonstrate this by defining the dark fiber metric as a measure of unused resource in networks. To improve the utilization, we propose scheduling-aware routing, a new technique that uses the current state of the batch system to determine a new set of network routes and so increases overall system utilization by up to 17.74\%. We also show that our proposed routing increases the throughput of communication benchmarks by up to 17.6\% on a practical InfiniBand installation. Our routing method is implemented in the standard InfiniBand tool set and can immediately be used to optimize systems. In fact, we are using it to improve the utilization of our production petascale supercomputer for more than one year.}, author = {Jens Domke and Torsten Hoefler}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, title = {Scheduling-Aware Routing for Supercomputers}, venue = {Salt Lake City, UT, USA}, year = {2016} }
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, Salt Lake City, UT, USA, November 2016
Over the last decade, CUDA and the underlying GPU hardware architecture have continuously gained popularity in various high-performance computing application domains such as climate modeling, computational chemistry, or machine learning. Despite this popularity, we lack a single coherent programming model for GPU clusters. We therefore introduce the dCUDA programming model, which implements device-side remote memory access with target notification. To hide instruction pipeline latencies, CUDA programs over-decompose the problem and over-subscribe the device by running many more threads than there are hardware execution units. Whenever a thread stalls, the hardware scheduler immediately proceeds with the execution of another thread ready for execution. This latency hiding technique is key to make best use of the available hardware resources. With dCUDA, we apply latency hiding at cluster scale to automatically overlap computation and communication. Our benchmarks demonstrate perfect overlap for memory bandwidth-bound tasks and good overlap for compute-bound tasks.
@inproceedings{abc, abstract = {Over the last decade, CUDA and the underlying GPU hardware architecture have continuously gained popularity in various high-performance computing application domains such as climate modeling, computational chemistry, or machine learning. Despite this popularity, we lack a single coherent programming model for GPU clusters. We therefore introduce the dCUDA programming model, which implements device-side remote memory access with target notification. To hide instruction pipeline latencies, CUDA programs over-decompose the problem and over-subscribe the device by running many more threads than there are hardware execution units. Whenever a thread stalls, the hardware scheduler immediately proceeds with the execution of another thread ready for execution. This latency hiding technique is key to make best use of the available hardware resources. With dCUDA, we apply latency hiding at cluster scale to automatically overlap computation and communication. Our benchmarks demonstrate perfect overlap for memory bandwidth-bound tasks and good overlap for compute-bound tasks.}, author = {Tobias Gysi and Jeremia B{\"a}r and Torsten Hoefler}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis}, title = {dCUDA: Hardware Supported Overlap of Computation and Communication}, venue = {Salt Lake City, UT, USA}, year = {2016} }
Proceedings of the 2016 ACM SIGPLAN International Conference on Object-Oriented Programming, Systems, Languages, and Applications, Amsterdam, Netherlands, November 2016
Recent advances in networking hardware have led to a new generation of Remote Memory Access (RMA) networks in which processors from different machines can communicate directly, bypassing the operating system and allowing higher performance. Researchers and practitioners have proposed libraries and programming models for RMA to enable the development of applications running on these networks,
However, the memory models implied by these RMA libraries and languages are often loosely specified, poorly understood, and differ depending on the underlying network architecture and other factors. Hence, it is difficult to precisely reason about the semantics of RMA programs or how changes in the network architecture affect them.
We address this problem with the following contributions: (i) a coreRMA language which serves as a common foundation, formalizing the essential characteristics of RMA programming; (ii) complete axiomatic semantics for that language; (iii) integration of our semantics with an existing constraint solver, enabling us to exhaustively generate coreRMA programs (litmus tests) up to a specified bound and check whether the tests satisfy their specification; and (iv) extensive validation of our semantics on real-world RMA systems. We generated and ran 7441 litmus tests using each of the low-level RMA network APIs: DMAPP, VPI Verbs, and Portals 4. Our results confirmed that our model successfully captures behaviors exhibited by these networks. Moreover, we found RMA programs that behave inconsistently with existing documentation, confirmed by network experts.
Our work provides an important step towards understanding existing RMA networks, thus influencing the design of future RMA interfaces and hardware.
@inproceedings{abc, abstract = {Recent advances in networking hardware have led to a new generation of Remote Memory Access (RMA) networks in which processors from different machines can communicate directly, bypassing the operating system and allowing higher performance. Researchers and practitioners have proposed libraries and programming models for RMA to enable the development of applications running on these networks, However, the memory models implied by these RMA libraries and languages are often loosely specified, poorly understood, and differ depending on the underlying network architecture and other factors. Hence, it is difficult to precisely reason about the semantics of RMA programs or how changes in the network architecture affect them. We address this problem with the following contributions: (i) a coreRMA language which serves as a common foundation, formalizing the essential characteristics of RMA programming; (ii) complete axiomatic semantics for that language; (iii) integration of our semantics with an existing constraint solver, enabling us to exhaustively generate coreRMA programs (litmus tests) up to a specified bound and check whether the tests satisfy their specification; and (iv) extensive validation of our semantics on real-world RMA systems. We generated and ran 7441 litmus tests using each of the low-level RMA network APIs: DMAPP, VPI Verbs, and Portals 4. Our results confirmed that our model successfully captures behaviors exhibited by these networks. Moreover, we found RMA programs that behave inconsistently with existing documentation, confirmed by network experts. Our work provides an important step towards understanding existing RMA networks, thus influencing the design of future RMA interfaces and hardware.}, author = {Andrei Marian Dan and Patrick Lam and Torsten Hoefler and Martin Vechev}, booktitle = {Proceedings of the 2016 ACM SIGPLAN International Conference on Object-Oriented Programming, Systems, Languages, and Applications}, title = {Modeling and Analysis of Remote Memory Access Programming}, venue = {Amsterdam, Netherlands}, year = {2016} }
IEEE Micro, July 2016
Network interface cards are one of the key components to achieve efficient parallel performance. In the past, they have gained new functionalities, such as lossless transmission and remote direct memory access, that are now ubiquitous in high-performance systems. Prototypes of next-generation network cards now offer new features that facilitate device programming. In this article, the authors discuss an abstract machine model for offloading architectures. They used the Portals 4 network interface to implement the proposed abstraction model, and they present two microbenchmarks to show the effects of fully offloaded collective communications. They then propose the concept of persistent offloaded operations that can reduce the creation/offloading overhead, and they discuss a possible extension to the current Portals 4 interface to enable their support. The results obtained show how this work can be used to accelerate existing MPI applications.
@article{abc, abstract = {Network interface cards are one of the key components to achieve efficient parallel performance. In the past, they have gained new functionalities, such as lossless transmission and remote direct memory access, that are now ubiquitous in high-performance systems. Prototypes of next-generation network cards now offer new features that facilitate device programming. In this article, the authors discuss an abstract machine model for offloading architectures. They used the Portals 4 network interface to implement the proposed abstraction model, and they present two microbenchmarks to show the effects of fully offloaded collective communications. They then propose the concept of persistent offloaded operations that can reduce the creation/offloading overhead, and they discuss a possible extension to the current Portals 4 interface to enable their support. The results obtained show how this work can be used to accelerate existing MPI applications.}, author = {Salvatore Di Girolamo and Pierre Jolivet and Keith D. Underwood and Torsten Hoefler}, pages = {6-17}, journal = {IEEE Micro}, title = {Exploiting Offload Enabled Network Interfaces}, volume = {36}, year = {2016} }
Proceedings of the 25th ACM International Symposium on High-Performance Parallel and Distributed Computing, Kyoto, Japan, June 2016
Lossless interconnection networks are omnipresent in high performance computing systems, data centers and network-on-chip architectures. Such networks require efficient and deadlock-free routing functions to utilize the available hardware. Topology-aware routing functions become increasingly inapplicable, due to irregular topologies, which either are irregular by design or as a result of hardware failures. Existing topology-agnostic routing methods either suffer from poor load balancing or are not bounded in the number of virtual channels needed to resolve deadlocks in the routing tables. We propose a novel topology-agnostic routing approach which implicitly avoids deadlocks during the path calculation instead of solving both problems separately. We present a model implementation, called Nue, of a destination-based and oblivious routing function. Nue routing heuristically optimizes the load balancing while enforcing deadlock-freedom without exceeding a given number of virtual channels, which we demonstrate based on the InfiniBand architecture.
@inproceedings{abc, abstract = {Lossless interconnection networks are omnipresent in high performance computing systems, data centers and network-on-chip architectures. Such networks require efficient and deadlock-free routing functions to utilize the available hardware. Topology-aware routing functions become increasingly inapplicable, due to irregular topologies, which either are irregular by design or as a result of hardware failures. Existing topology-agnostic routing methods either suffer from poor load balancing or are not bounded in the number of virtual channels needed to resolve deadlocks in the routing tables. We propose a novel topology-agnostic routing approach which implicitly avoids deadlocks during the path calculation instead of solving both problems separately. We present a model implementation, called Nue, of a destination-based and oblivious routing function. Nue routing heuristically optimizes the load balancing while enforcing deadlock-freedom without exceeding a given number of virtual channels, which we demonstrate based on the InfiniBand architecture.}, author = {Jens Domke and Torsten Hoefler and Satoshi Matsuoka}, booktitle = {Proceedings of the 25th ACM International Symposium on High-Performance Parallel and Distributed Computing}, title = {Routing on the Dependency Graph: A New Approach to Deadlock-Free High-Performance Routing}, venue = {Kyoto, Japan}, year = {2016} }
Proceedings of the 25th ACM International Symposium on High-Performance Parallel and Distributed Computing, Kyoto, Japan, June 2016
We propose a topology-aware distributed Reader-Writer lock that accelerates irregular workloads for supercomputers and data centers. The core idea behind the lock is a modular design that is an interplay of three distributed data structures: a counter of readers/writers in the critical section, a set of queues for ordering writers waiting for the lock, and a tree that binds all the queues and synchronizes writers with readers. Each structure is associated with a parameter for favoring either readers or writers, enabling adjustable performance that can be viewed as a point in a three dimensional parameter space. We also develop a distributed topology-aware MCS lock that is a building block of the above design and improves state-of-the-art MPI implementations. Both schemes use non-blocking Remote Memory Access (RMA) techniques for highest performance and scalability. We evaluate our schemes on a Cray XC30 and illustrate that they outperform state-of-the-art MPI-3 RMA locking protocols by 81% and 73%, respectively. Finally, we use them to accelerate a distributed hashtable that represents irregular workloads such as key-value stores or graph processing.
@inproceedings{abc, abstract = {We propose a topology-aware distributed Reader-Writer lock that accelerates irregular workloads for supercomputers and data centers. The core idea behind the lock is a modular design that is an interplay of three distributed data structures: a counter of readers/writers in the critical section, a set of queues for ordering writers waiting for the lock, and a tree that binds all the queues and synchronizes writers with readers. Each structure is associated with a parameter for favoring either readers or writers, enabling adjustable performance that can be viewed as a point in a three dimensional parameter space. We also develop a distributed topology-aware MCS lock that is a building block of the above design and improves state-of-the-art MPI implementations. Both schemes use non-blocking Remote Memory Access (RMA) techniques for highest performance and scalability. We evaluate our schemes on a Cray XC30 and illustrate that they outperform state-of-the-art MPI-3 RMA locking protocols by 81\% and 73\%, respectively. Finally, we use them to accelerate a distributed hashtable that represents irregular workloads such as key-value stores or graph processing.}, author = {Patrick Schmid and Maciej Besta and Torsten Hoefler}, booktitle = {Proceedings of the 25th ACM International Symposium on High-Performance Parallel and Distributed Computing}, title = {High-Performance Distributed RMA Locks}, venue = {Kyoto, Japan}, year = {2016} }
Proceedings of the 2016 International Conference on Supercomputing, Istanbul, Turkey, June 2016
Programming today's increasingly complex heterogeneous hardware is difficult, as it commonly requires the use of data-parallel languages, pragma annotations, specialized libraries, or DSL compilers. Adding explicit accelerator support into a larger code base is not only costly, but also introduces additional complexity that hinders long-term maintenance. We propose a new heterogeneous compiler that brings us closer to the dream of automatic accelerator mapping. Starting from a sequential compiler IR, we automatically generate a hybrid executable that - in combination with a new data management system - transparently offloads suitable code regions. Our approach is almost regression free for a wide range of applications while improving a range of compute kernels as well as two full SPEC CPU applications. We expect our work to reduce the initial cost of accelerator usage and to free developer time to investigate algorithmic changes.
@inproceedings{abc, abstract = {Programming today{\textquoteright}s increasingly complex heterogeneous hardware is difficult, as it commonly requires the use of data-parallel languages, pragma annotations, specialized libraries, or DSL compilers. Adding explicit accelerator support into a larger code base is not only costly, but also introduces additional complexity that hinders long-term maintenance. We propose a new heterogeneous compiler that brings us closer to the dream of automatic accelerator mapping. Starting from a sequential compiler IR, we automatically generate a hybrid executable that - in combination with a new data management system - transparently offloads suitable code regions. Our approach is almost regression free for a wide range of applications while improving a range of compute kernels as well as two full SPEC CPU applications. We expect our work to reduce the initial cost of accelerator usage and to free developer time to investigate algorithmic changes.}, author = {Tobias Grosser and Torsten Hoefler}, booktitle = {Proceedings of the 2016 International Conference on Supercomputing}, title = {Polly-ACC: Transparent compilation to heterogeneous hardware}, venue = {Istanbul, Turkey}, year = {2016} }
Proceedings of the Platform for Advanced Scientific Computing Conference, Lausanne, Switzerland, June 2016
We discuss the paper selection process of the ACM PASC16 conference. The conference spans multiple scientific fields used to very different publication cultures. We aim to combine the strengths of the conference and journal publication schemes in order to design an attractive high-quality publication venue for works in large-scale computational science. We use four non-standard key ideas to design a paper selection process for ACM PASC16: (1) no pre-selected committee, (2) short revision process, (3) full double-blindness, and (4) suggested expert reviews. In this overview, we describe our observations of the process and anlayse the data in an attempt to characterize the effectiveness of the mechanisms used. We believe that the adoption of some or all of these ideas could prove beneficial beyond ACM PASC16.
@inproceedings{abc, abstract = {We discuss the paper selection process of the ACM PASC16 conference. The conference spans multiple scientific fields used to very different publication cultures. We aim to combine the strengths of the conference and journal publication schemes in order to design an attractive high-quality publication venue for works in large-scale computational science. We use four non-standard key ideas to design a paper selection process for ACM PASC16: (1) no pre-selected committee, (2) short revision process, (3) full double-blindness, and (4) suggested expert reviews. In this overview, we describe our observations of the process and anlayse the data in an attempt to characterize the effectiveness of the mechanisms used. We believe that the adoption of some or all of these ideas could prove beneficial beyond ACM PASC16.}, author = {Torsten Hoefler}, booktitle = {Proceedings of the Platform for Advanced Scientific Computing Conference}, title = {Selecting Technical Papers for an Interdisciplinary Conference: The PASC Review Process}, venue = {Lausanne, Switzerland}, year = {2016} }
The International Journal of High Performance Computing Applications, February 2016
Relaxed synchronization offers the potential for maintaining application scalability, by allowing many processes to make independent progress when some processes suffer delays. Yet the benefits of this approach for important parallel workloads have not been investigated in detail. In this paper, we use a validated simulation approach to explore the noise-mitigation effects of idealized nonblocking collectives, in workloads where these collectives are a major contributor to total execution time. Although nonblocking collectives are unlikely to provide significant noise mitigation to applications in the low operating system noise environments expected in next-generation high-performance computing systems, we show that they can potentially improve application runtime with respect to other noise types.
@article{abc, abstract = {Relaxed synchronization offers the potential for maintaining application scalability, by allowing many processes to make independent progress when some processes suffer delays. Yet the benefits of this approach for important parallel workloads have not been investigated in detail. In this paper, we use a validated simulation approach to explore the noise-mitigation effects of idealized nonblocking collectives, in workloads where these collectives are a major contributor to total execution time. Although nonblocking collectives are unlikely to provide significant noise mitigation to applications in the low operating system noise environments expected in next-generation high-performance computing systems, we show that they can potentially improve application runtime with respect to other noise types.}, author = {Patrick M. Widener and Scott Levy and Kurt B. Ferreira and Torsten Hoefler}, pages = {121-133}, journal = {The International Journal of High Performance Computing Applications}, title = {On noise and the performance benefit of nonblocking collectives}, volume = {30}, year = {2016} }
IEEE Transactions on Parallel and Distributed Systems, January 2016
The increase in the number of cores per processor and the complexity of memory hierarchies make cache coherence key for programmability of current shared memory systems. However, ignoring its detailed architectural characteristics can harm performance significantly. In order to assist performance-centric programming, we propose a methodology to allow semi-automatic performance tuning with the systematic translation from an algorithm to an analytic performance model for cache line transfers. For this, we design a simple interface for cache line aware optimization, a translation methodology, and a full performance model that exposes the block-based design of caches to middleware designers. We investigate two different architectures to show the applicability of our techniques and methods: the many-core accelerator Intel Xeon Phi and a multi-core processor with a NUMA configuration (Intel Sandy Bridge). We use mathematical optimization techniques to tune synchronization algorithms to the microarchitectures, identifying three techniques to design and optimize data transfers in our model: single-use, single-step broadcast, and private cache lines.
@article{abc, abstract = {The increase in the number of cores per processor and the complexity of memory hierarchies make cache coherence key for programmability of current shared memory systems. However, ignoring its detailed architectural characteristics can harm performance significantly. In order to assist performance-centric programming, we propose a methodology to allow semi-automatic performance tuning with the systematic translation from an algorithm to an analytic performance model for cache line transfers. For this, we design a simple interface for cache line aware optimization, a translation methodology, and a full performance model that exposes the block-based design of caches to middleware designers. We investigate two different architectures to show the applicability of our techniques and methods: the many-core accelerator Intel Xeon Phi and a multi-core processor with a NUMA configuration (Intel Sandy Bridge). We use mathematical optimization techniques to tune synchronization algorithms to the microarchitectures, identifying three techniques to design and optimize data transfers in our model: single-use, single-step broadcast, and private cache lines.}, author = {Sabela Ramos and Torsten Hoefler}, pages = {2824-2837}, journal = {IEEE Transactions on Parallel and Distributed Systems}, title = {Cache Line Aware Algorithm Design for Cache-Coherent Architectures}, volume = {27}, year = {2016} }
2015
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015, Austin, TX, USA, November 2015
@inproceedings{abc, author = {Georgios Kathareios and Cyriel Minkenberg and Bogdan Prisacari and Germ{\'a}n Rodr{\'\i}guez and Torsten Hoefler}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015, Austin, TX, USA}, title = {Cost-effective diameter-two topologies: analysis and evaluation.}, url = {http://doi.acm.org/10.1145/2807591.2807652}, year = {2015} }
Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015, Austin, TX, USA, November 2015
@inproceedings{abc, author = {Torsten Hoefler and Roberto Belli}, booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC 2015, Austin, TX, USA}, title = {Scientific benchmarking of parallel computing systems: twelve ways to tell the masses when reporting performance results.}, url = {http://doi.acm.org/10.1145/2807591.2807644}, year = {2015} }
TOPC, July 2015
@article{abc, author = {Torsten Hoefler and James Dinan and Rajeev Thakur and Brian W. Barrett and Pavan Balaji and William Gropp and Keith D. Underwood}, journal = {TOPC}, title = {Remote Memory Access Programming in MPI-3.}, url = {http://doi.acm.org/10.1145/2780584}, year = {2015} }
Proceedings of the 29th ACM on International Conference on Supercomputing, ICS'15, Newport Beach/Irvine, CA, USA, June 2015
@inproceedings{abc, author = {Tobias Gysi and Tobias Grosser and Torsten Hoefler}, booktitle = {Proceedings of the 29th ACM on International Conference on Supercomputing, ICS{\textquoteright}15, Newport Beach/Irvine, CA, USA}, title = {MODESTO: Data-centric Analytic Optimization of Complex Stencil Programs on Heterogeneous Architectures.}, url = {http://doi.acm.org/10.1145/2751205.2751223}, year = {2015} }
Proceedings of the 29th ACM on International Conference on Supercomputing, ICS'15, Newport Beach/Irvine, CA, USA, June 2015
@inproceedings{abc, author = {Maciej Besta and Torsten Hoefler}, booktitle = {Proceedings of the 29th ACM on International Conference on Supercomputing, ICS{\textquoteright}15, Newport Beach/Irvine, CA, USA}, title = {Active Access: A Mechanism for High-Performance Distributed Data-Centric Computations.}, url = {http://doi.acm.org/10.1145/2751205.2751219}, year = {2015} }
Proceedings of the 29th ACM on International Conference on Supercomputing, ICS'15, Newport Beach/Irvine, CA, USA, June 2015
@inproceedings{abc, author = {Sergei Shudler and Alexandru Calotoiu and Torsten Hoefler and Alexandre Strube and Felix Wolf}, booktitle = {Proceedings of the 29th ACM on International Conference on Supercomputing, ICS{\textquoteright}15, Newport Beach/Irvine, CA, USA}, title = {Exascaling Your Library: Will Your Implementation Meet Your Expectations?}, url = {http://doi.acm.org/10.1145/2751205.2751216}, year = {2015} }
Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2015, Portland, OR, USA, June 2015
@inproceedings{abc, author = {Maciej Besta and Torsten Hoefler}, booktitle = {Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2015, Portland, OR, USA}, title = {Accelerating Irregular Computations with Hardware Transactional Memory and Active Messages.}, url = {http://doi.acm.org/10.1145/2749246.2749263}, year = {2015} }
Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2015, Portland, OR, USA, June 2015
@inproceedings{abc, author = {Sabela Ramos and Torsten Hoefler}, booktitle = {Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2015, Portland, OR, USA}, title = {Cache Line Aware Optimizations for ccNUMA Systems.}, url = {http://doi.acm.org/10.1145/2749246.2749256}, year = {2015} }
Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2015, Portland, OR, USA, June 2015
@inproceedings{abc, author = {Marius Poke and Torsten Hoefler}, booktitle = {Proceedings of the 24th International Symposium on High-Performance Parallel and Distributed Computing, HPDC 2015, Portland, OR, USA}, title = {DARE: High-Performance State Machine Replication on RDMA Networks.}, url = {http://doi.acm.org/10.1145/2749246.2749267}, year = {2015} }
The 10th International Conference on Future Internet, CFI '15, Seoul, Republic of Korea, June 2015
@inproceedings{abc, author = {Taeho Lee and Christos Pappas and Cristina Basescu and Jun Han and Torsten Hoefler and Adrian Perrig}, booktitle = {The 10th International Conference on Future Internet, CFI {\textquoteright}15, Seoul, Republic of Korea}, title = {Source-Based Path Selection: The Data Plane Perspective.}, url = {http://doi.acm.org/10.1145/2775088.2775090}, year = {2015} }
15th Workshop on Hot Topics in Operating Systems, HotOS XV, Kartause Ittingen, Switzerland, May 2015
@inproceedings{abc, author = {Torsten Hoefler and Robert B. Ross and Timothy Roscoe}, booktitle = {15th Workshop on Hot Topics in Operating Systems, HotOS XV, Kartause Ittingen, Switzerland}, title = {Distributing the Data Plane for Remote Storage Access.}, url = {https://www.usenix.org/conference/hotos15/workshop-program/presentation/hoefler}, year = {2015} }
2015 IEEE International Parallel and Distributed Processing Symposium, IPDPS 2015, Hyderabad, India, May 2015
@inproceedings{abc, author = {Roberto Belli and Torsten Hoefler}, booktitle = {2015 IEEE International Parallel and Distributed Processing Symposium, IPDPS 2015, Hyderabad, India}, title = {Notified Access: Extending Remote Memory Access Programming Models for Producer-Consumer Synchronization.}, url = {http://dx.doi.org/10.1109/IPDPS.2015.30}, year = {2015} }
2015 IEEE International Parallel and Distributed Processing Symposium Workshop, IPDPS 2015, Hyderabad, India, May 2015
@inproceedings{abc, author = {Torsten Hoefler and Laxmikant V. Kale}, booktitle = {2015 IEEE International Parallel and Distributed Processing Symposium Workshop, IPDPS 2015, Hyderabad, India}, title = {HIPS-LSPP Keynotes.}, url = {http://dx.doi.org/10.1109/IPDPSW.2015.173}, year = {2015} }
IJHPCA, February 2015
@article{abc, author = {Kamil Iskra and Torsten Hoefler}, journal = {IJHPCA}, title = {Operating systems and runtime environments on supercomputers.}, url = {http://dx.doi.org/10.1177/1094342014560666}, year = {2015} }