@article{do-ccpe-2022,author={Do, Tu Mai Anh and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Ca\'{\i}no-Lores, Silvina and Taufer, Michela and Deelman, Ewa},title={Performance assessment of ensembles of in situ workflows under resource constraints},journal={Concurrency and Computation: Practice and Experience},year={2022},doi={10.1002/cpe.7111},keywords={mine,notalpha,isi},note={Funding Acknowledgments: NSF 1664162},}
@article{coleman-fgcs-2022,title={WfCommons: A framework for enabling scientific workflow research and development},journal={Future Generation Computer Systems},volume={128},pages={16--27},year={2022},issn={0167-739X},doi={10.1016/j.future.2021.09.043},author={Coleman, Tain\~a and Casanova, Henri and Pottier, Lo{\"i}c and Kaushik, Manav and Deelman, Ewa and {Ferreira da Silva}, Rafael},keywords={mine,isi},note={Funding Acknowledgments: NSF 1923539},}
@inproceedings{casanova-jssp-2022,title={On the Feasibility of Simulation-driven Portfolio Scheduling for Cyberinfrastructure Runtime Systems},author={Casanova, Henri and Ching Wong, Yick and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael},booktitle={Job Scheduling Strategies for Parallel Processing (JSSPP)},pages={To appear},year={2022},publisher={Springer Nature},note={Funding Acknowledgments: NSF 2106059 and 2106147, DOE DE-AC05-00OR22725},keywords={mine,notalpha,isi},}
@inproceedings{pottier-ccgrid-2022,title={Accelerating Scientific Workflows on HPC Platforms with In Situ Processing},author={Do, Tu Mai Anh and Pottier, Lo{\"i}c and Yildiz, Orcun and Vahi, Karan and Krawczuk, Patrycja and Peterka, Tom and Deelman, Ewa},booktitle={IEEE/ACM 22nd International Symposium on Cluster, Cloud and Internet Computing (CCGrid)},author+an={1=jointfirst;2=jointfirst},doi={10.1109/CCGrid54584.2022.00009},pages={1--10},year={2022},organization={IEEE},note={Funding Acknowledgments: NSF 1664162, DOE DE-AC02-06CH11357, DE-AC02-05CH11231, DE-SC0012636 and DE-SC0022328},keywords={mine,notalpha,isi},addendum={\textcolor{brown}{\emph{The highlighted authors are joint first authors with equal contributions.}}},}
@article{do2021jocs,title={A Lightweight Method for Evaluating In Situ Workflow Efficiency},author={Do, Tu Mai Anh and Pottier, Lo{\"i}c and Ca\'{\i}no-Lores, Silvina and {Ferreira da Silva}, Rafael and Cuendet, Michel A. and Weinstein, Harel and Estrada, Trilce and Taufer, Michela and Deelman, Ewa},journal={Journal of Computational Science},volume={48},year={2021},doi={10.1016/j.jocs.2020.101259},keywords={mine,isi},note={Funding Acknowledgments: NSF 1741040, DOE DE-SC0012636},}
@inproceedings{krawczuk-works-2021,title={A Performance Characterization of Scientific Machine Learning Workflows},author={Krawczuk, Patrycja and Papadimitriou, George and Tanaka, Ryan and Do, Tu Mai Anh and Subramany, Srujana and Nagarkar, Shubham and Jain, Aditi and Lam, Kelsie and Mandal, Anirban and Pottier, Lo{\"i}c and Deelman, Ewa},booktitle={2021 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS)},year={2021},pages={58--65},keywords={mine,workshop,isi},doi={10.1109/WORKS54523.2021.00013},note={Funding Acknowledgments: DOE DE-SC0012636, NSF 1664162},}
@inproceedings{do2021p2s2,author={Do, Tu Mai Anh and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Ca\'{\i}no-Lores, Silvina and Taufer, Michela and Deelman, Ewa},title={Assessing Resource Provisioning and Allocation of Ensembles of In Situ Workflows},year={2021},isbn={9781450384414},publisher={Association for Computing Machinery},address={New York, NY, USA},doi={10.1145/3458744.3474051},booktitle={50th International Conference on Parallel Processing Workshop},articleno={38},numpages={10},location={Lemont, IL, USA},series={ICPP Workshops '21},keywords={mine,workshop,isi},note={Funding Acknowledgments: NSF 1741040, DOE DE-SC0012636},}
@inproceedings{do2020iccs,author={Do, Tu Mai Anh and Pottier, Lo{\"i}c and Thomas, Stephen and {Ferreira da Silva}, Rafael and Cuendet, Michel A. and Weinstein, Harel and Estrada, Trilce and Taufer, Michela and Deelman, Ewa},title={A Novel Metric to Evaluate In Situ Workflows},booktitle={International Conference on Computational Science (ICCS)},year={2020},pages={538--553},keywords={mine,isi},doi={10.1007/978-3-030-50371-0_40},note={Funding Acknowledgments: NSF 1741040},}
@inproceedings{ferreiradasilva2020works,title={WorkflowHub: Community Framework for Enabling Scientific Workflow Research and Development},author={{Ferreira da Silva}, Rafael and Pottier, Lo{\"i}c and Coleman, Tain\~a and Deelman, Ewa and Casanova, Henri},booktitle={2020 IEEE/ACM Workflows in Support of Large-Scale Science (WORKS)},year={2020},pages={49--56},keywords={mine,workshop,isi},doi={10.1109/WORKS51914.2020.00012},note={Funding Acknowledgments: NSF 2016619, DOE DE-SC0012636, NSF 1664162, NSF 1923539},}
@inproceedings{pottier2020cluster,author={Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Casanova, Henri and Deelman, Ewa},title={Modeling the Performance of Scientific Workflow Executions on HPC Platforms with Burst Buffers},booktitle={2020 IEEE International Conference on Cluster Computing (CLUSTER)},year={2020},pages={92--103},doi={10.1109/CLUSTER49012.2020.00019},keywords={mine,isi},note={Funding Acknowledgments: DOE DE-SC0012636, NSF 1664162, NSF 1741040, NSF 1923539, NSF 1923621},}
With the recent advent of many-core architectures such as chip multiprocessors (CMPs), the number of processing units accessing a global shared memory is constantly increasing. Co-scheduling techniques are used to improve application throughput on such architectures, but sharing resources often generates critical interferences. In this article, we focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application its own cache area. We consider m iterative HPC applications running concurrently and answer the following questions: (i) how to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.
@article{ijhpca2019cmp,author={Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"i}c and Robert, Yves},year={2019},month=apr,title={Co-scheduling HPC workloads on cache-partitioned CMP platforms},journal={International Journal of High Performance Computing Applications},doi={10.1177/1094342019846956},keywords={mine,ensl},}
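The LLC isolation used in this work relies on Intel CAT, which Linux exposes through the resctrl filesystem. Below is a minimal sketch of how two co-scheduled applications could each be given an exclusive slice of the L3 cache; the group names, PIDs, and way masks are illustrative assumptions, not values from the article, and the script assumes resctrl is mounted and root privileges.

import os

RESCTRL = "/sys/fs/resctrl"  # assumes: mount -t resctrl resctrl /sys/fs/resctrl

def make_partition(name, way_mask, pids, cache_id=0):
    """Create a resctrl group owning the LLC ways in way_mask (a
    contiguous bitmask, e.g. 0xff = 8 ways) and move pids into it."""
    group = os.path.join(RESCTRL, name)
    os.makedirs(group, exist_ok=True)
    # The schemata file maps each L3 cache id to a capacity bitmask.
    with open(os.path.join(group, "schemata"), "w") as f:
        f.write(f"L3:{cache_id}={way_mask:x}\n")
    for pid in pids:  # the kernel accepts one task id per write
        with open(os.path.join(group, "tasks"), "w") as f:
            f.write(str(pid))

# Illustrative split of a 20-way LLC between two applications:
# 12 ways for the pid-1234 application, the 8 remaining ways for pid 5678.
make_partition("appA", 0b11111111111100000000, [1234])
make_partition("appB", 0b00000000000011111111, [5678])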
@inproceedings{deelman-escience-2019,title={Cyberinfrastructure Center of Excellence Pilot: Connecting Large Facilities Cyberinfrastructure},author={Deelman, Ewa and Mandal, Anirban and Pascucci, Valerio and Sons, Susan and Wyngaard, Jane and Vardeman II, Charles F and Petruzza, Steve and Baldin, Ilya and Christopherson, Laura and Mitchell, Ryan and Pottier, Lo{\"i}c and Rynge, Mats and Scott, Erik and Vahi, Karan and Kogan, Marina and Mann, Jasmine A and Gulbransen, Tom and Allen, Daniel and Barlow, David and Bonarrigo, Santiago and Clark, Chris and Goldman, Leslie and Goulden, Tristan and Harvey, Phil and Hulsander, David and Jacob, Steve and Laney, Christine and Lobo-Padilla, Ivan and Sampson, Jeremey and Staarmann, John and Stone, Steve},booktitle={15th International Conference on eScience (eScience)},year={2019},location={San Diego, CA, USA},note={Funding Acknowledgments: NSF 1842042},keywords={mine,isi},}
@inproceedings{mitchell2019btsd,title={Exploration of Workflow Management Systems Emerging Features from Users Perspectives},author={Mitchell, Ryan and Pottier, Lo{\"i}c and Jacobs, Steve and {Ferreira da Silva}, Rafael and Rynge, Mats and Vahi, Karan and Deelman, Ewa},booktitle={First International Workshop on Big Data Tools, Methods, and Use Cases for Innovative Scientific Discovery (BTSD)},year={2019},note={Funding Acknowledgments: NSF 1842042},keywords={mine,workshop,isi},}
@inproceedings{thomas-escience-2019,title={Characterization of In Situ and In Transit Analytics of Molecular Dynamics Simulations for Next-generation Supercomputers},author={Thomas, Stephen and Wyatt, Michael and Do, Tu Mai Anh and Pottier, Lo{\"i}c and {Ferreira da Silva}, Rafael and Weinstein, Harel and Cuendet, Michel A. and Estrada, Trilce and Deelman, Ewa and Taufer, Michela},booktitle={15th International Conference on eScience (eScience)},year={2019},pages={188--198},doi={10.1109/eScience.2019.00027},note={Funding Acknowledgments: NSF 1741040},keywords={mine,isi},}
Recently, the benefits of co-scheduling several applications have been demonstrated in a fault-free context, both in terms of performance and energy savings. However, large-scale computer systems are confronted by frequent failures, and resilience techniques must be employed for large applications to execute efficiently. Indeed, failures may create severe imbalance between applications and significantly degrade performance. In this article, we aim at minimizing the expected completion time of a set of co-scheduled applications. We propose to redistribute the resources assigned to each application upon the occurrence of failures, and upon the completion of some applications, in order to achieve this goal. First, we introduce a formal model and establish complexity results. The problem is NP-complete for malleable applications, even in a fault-free context. Therefore, we design polynomial-time heuristics that perform redistributions and account for processor failures. A fault simulator is used to perform extensive simulations that demonstrate the usefulness of redistribution and the performance of the proposed heuristics.
@article{ijhpca2018resilience,author={Benoit, Anne and Pottier, Lo{\"i}c and Robert, Yves},title={Resilient co-scheduling of malleable applications},journal={International Journal of High Performance Computing Applications},volume={32},number={1},pages={89--103},year={2018},doi={10.1177/1094342017704979},keywords={mine,ensl},}
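The redistribution idea is easy to prototype. The sketch below is not the article's heuristics; it only illustrates the mechanism, under the simplifying assumption of malleable applications with linear speedup, by greedily rebalancing the processors that survive a failure so that the estimated remaining times are equalized.

import heapq

def redistribute(remaining_work, total_procs):
    """Return an integer processor count (>= 1) per application that
    greedily minimizes the maximum estimated remaining time, assuming
    time = remaining_work / procs (linear speedup)."""
    n = len(remaining_work)
    assert total_procs >= n, "need at least one processor per application"
    procs = [1] * n
    # Max-heap keyed on estimated remaining time under current allocation.
    heap = [(-w, i) for i, w in enumerate(remaining_work)]
    heapq.heapify(heap)
    for _ in range(total_procs - n):
        _, i = heapq.heappop(heap)  # application with the longest time
        procs[i] += 1
        heapq.heappush(heap, (-remaining_work[i] / procs[i], i))
    return procs

# Example: a failure leaves 10 processors for three applications with
# 100, 40, and 20 units of work left.
print(redistribute([100, 40, 20], 10))  # [6, 3, 1]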
Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are as follows: (i) which proportion of cache and (ii) how many processors should be given to each application? In this article, we provide answers to (i) and (ii) for Amdahl applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for Amdahl applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.
@article{ijhpca2018cache,author={Aupy, Guillaume and Benoit, Anne and Dai, Sicheng and Pottier, Lo{\"i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},title={Co-scheduling Amdahl applications on cache-partitioned systems},journal={International Journal of High Performance Computing Applications},volume={32},number={1},pages={123--138},year={2018},doi={10.1177/1094342017710806},keywords={mine,ensl},}
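A compact way to state the underlying optimization problem, assuming the standard Amdahl speedup model (the notation below is ours, not necessarily the article's): application $i$ receives $p_i$ cores and a fraction $x_i$ of the LLC, $s_i$ is its sequential fraction, and $t_i(x_i)$ its single-core time given its cache share.

\begin{align*}
  T_i(p_i, x_i) &= s_i\, t_i(x_i) + \frac{(1 - s_i)\, t_i(x_i)}{p_i}
  && \text{(Amdahl execution time)}\\
  \text{minimize} \quad & \max_{1 \le i \le n} T_i(p_i, x_i)
  && \text{(makespan)}\\
  \text{subject to} \quad & \sum_{i=1}^{n} p_i \le P, \quad
  \sum_{i=1}^{n} x_i \le 1, \quad x_i \ge 0
  && \text{(core and cache budgets)}
\end{align*}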
Co-scheduling techniques are used to improve the throughput of applications on chip multiprocessors (CMP), but sharing resources often generates critical interferences. We focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application its own cache area. We consider m iterative HPC applications running concurrently and answer the following questions: (i) how to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.
@inproceedings{cluster18,author={Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"i}c and Robert, Yves},title={Co-Scheduling {HPC} Workloads on Cache-Partitioned {CMP} Platforms},booktitle={{IEEE} International Conference on Cluster Computing, {CLUSTER} 2018, Belfast, UK, September 10-13, 2018},pages={348--358},publisher={{IEEE} Computer Society},year={2018},doi={10.1109/CLUSTER.2018.00052},keywords={mine,ensl},}
This work presents a realistic performance model to execute scientific workflows on high-bandwidth-memory architectures such as the Intel Knights Landing. We provide a detailed analysis of the execution time on such platforms, taking into account transfers from both fast and slow memory and their overlap with computations. We discuss several scheduling and mapping strategies: not only must tasks be assigned to computing resources, but one must also decide which fraction of input and output data will reside in fast memory and which will have to stay in slow memory. We use extensive simulations to assess the impact of the mapping strategies on performance. We also conduct experiments with a simple 1D Gauss-Seidel kernel; these experiments assess the accuracy of the model and further demonstrate the importance of tuned memory management. Our model and results lay the foundations for further studies and experiments on dual-memory systems.
@inproceedings{icpp2018,author={Benoit, Anne and Perarnau, Swann and Pottier, Lo{\"i}c and Robert, Yves},title={A Performance Model to Execute Workflows on High-Bandwidth-Memory Architectures},booktitle={Proceedings of the 47th International Conference on Parallel Processing, {ICPP} 2018, Eugene, OR, USA, August 13-16, 2018},pages={36:1--36:10},articleno={36},publisher={{ACM}},address={New York, NY, USA},acmid={3225110},isbn={978-1-4503-6510-9},location={Eugene, OR, USA},year={2018},doi={10.1145/3225058.3225110},keywords={mine,ensl},}
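A toy version of such a dual-memory task model (not the paper's exact equations) makes the mapping trade-off concrete: with transfers fully overlapped with computation, a task finishes when its slowest stream does, so placing more data in fast memory only helps bandwidth-bound tasks. The bandwidth and throughput numbers below are illustrative, Knights Landing-like assumptions.

def task_time(flops, in_bytes, out_bytes, f_in, f_out,
              gflops=1000.0, bw_fast=400e9, bw_slow=90e9):
    """f_in/f_out: fractions of the input/output placed in fast memory.
    Full-overlap assumption: the task time is the slowest of the
    compute, fast-memory, and slow-memory streams."""
    compute = flops / (gflops * 1e9)
    fast = (in_bytes * f_in + out_bytes * f_out) / bw_fast
    slow = (in_bytes * (1 - f_in) + out_bytes * (1 - f_out)) / bw_slow
    return max(compute, fast, slow)

# A bandwidth-bound task (16 GB of traffic, 100 GFLOP) speeds up once
# its data moves to fast memory, until it becomes compute-bound:
print(task_time(1e11, 8e9, 8e9, f_in=0.0, f_out=0.0))  # ~0.178 s, all slow
print(task_time(1e11, 8e9, 8e9, f_in=1.0, f_out=1.0))  # 0.100 s, compute-bound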
Big data applications play an increasing role in high-performance computing. They are perfect candidates for co-scheduling, as they obey flexible speedup models, alternating I/O operations and intensive computation phases. In this chapter, we discuss co-scheduling on failure-prone platforms. Checkpointing helps to mitigate the impact of a failure on a given application, but it must be complemented by redistributions to rebalance the load among all applications. Co-scheduling usually involves partitioning the applications into packs, and then scheduling each pack in sequence, as efficiently as possible. The objective is therefore to determine a partition into packs, and an assignment of processors to applications, that minimize the sum of the execution times of the packs. On the theoretical side, we assess the problem complexity. On the practical side, we design several polynomial-time heuristics to deal with the general problem with failures and redistribution costs. The proposed heuristics show very good performance while executing in a very short time, hence validating the approach.
@incollection{chapter2016crc,author={Aupy, Guillaume and Benoit, Anne and Pottier, Lo{\"i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},booktitle={Big Data Management and Processing},publisher={Chapman and Hall/CRC Press},editor={Li, Kuan-Ching and Jiang, Hai and Zomaya, Albert},title={{Co-scheduling high-performance computing applications}},chapter={5},pages={81--104},month=may,year={2017},doi={10.1201/9781315154008-5},keywords={mine,ensl},}
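The pack-scheduling objective sketched in this chapter can be stated compactly (notation ours): partition the applications $A_1, \dots, A_n$ into packs $B_1, \dots, B_k$, run the applications of a pack concurrently on the $P$ processors, and run the packs in sequence.

\[
  \min_{\{B_1,\dots,B_k\},\, \{p_i\}} \;
  \sum_{j=1}^{k} \, \max_{A_i \in B_j} T_i(p_i)
  \qquad \text{subject to} \quad
  \sum_{A_i \in B_j} p_i \le P \ \text{ for each pack } B_j,
\]

where $T_i(p_i)$ is the execution time of $A_i$ on $p_i$ processors, including checkpoint and redistribution overheads in the failure-prone setting.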
Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are: (i) which proportion of cache and (ii) how many processors should be given to each application? Here, we assign rational numbers of processors to each application, since they can be shared across applications through multi-threading. In this paper, we provide answers to (i) and (ii) for perfectly parallel applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for general applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.
@inproceedings{apdcm2017,author={Aupy, Guillaume and Benoit, Anne and Pottier, Lo{\"i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},title={Co-Scheduling Algorithms for Cache-Partitioned Systems},booktitle={2017 {IEEE} International Parallel and Distributed Processing Symposium Workshops, {IPDPS} Workshops 2017, Orlando / Buena Vista, FL, USA, May 29 - June 2, 2017},pages={874--883},publisher={{IEEE} Computer Society},year={2017},month=may,location={Lake Buena Vista, FL, USA},doi={10.1109/IPDPSW.2017.60},isbn={978-1-5386-3408-0},keywords={mine,workshop,ensl},}
Recently, the benefits of co-scheduling several applications have been demonstrated in a fault-free context, both in terms of performance and energy savings. However, large-scale computer systems are confronted with frequent failures, and resilience techniques must be employed to ensure the completion of large applications. Indeed, failures may create severe imbalance between applications, and significantly degrade performance. In this paper, we propose to redistribute the resources assigned to each application when failures strike, in order to minimize the expected completion time of a set of co-scheduled applications. First, we introduce a formal model and establish complexity results. When no redistribution is allowed, we can minimize the expected completion time in polynomial time, while the problem becomes NP-complete with redistributions, even in a fault-free context. Therefore, we design polynomial-time heuristics that perform redistributions and account for processor failures. A fault simulator is used to perform extensive simulations that demonstrate the usefulness of redistribution and the performance of the proposed heuristics.
@inproceedings{icpp2016,author={Benoit, Anne and Pottier, Lo{\"i}c and Robert, Yves},title={Resilient Application Co-scheduling with Processor Redistribution},booktitle={45th International Conference on Parallel Processing, {ICPP} 2016, Philadelphia, PA, USA, August 16-19, 2016},pages={123--132},publisher={{IEEE} Computer Society},year={2016},month=aug,location={Philadelphia, PA, USA},doi={10.1109/ICPP.2016.21},issn={2332-5690},isbn={978-1-5090-2823-8},keywords={mine,ensl},}