Publications

@inproceedings{Europar2020,
  author = {Barua, Prithayan
and Zhao, Jisheng
and Sarkar, Vivek},
  editor = {Malawski, Maciej
and Rzadca, Krzysztof},
  title = {OmpMemOpt: Optimized Memory Movement for Heterogeneous Computing},
  booktitle = {Euro-Par 2020: Parallel Processing},
  year = {2020},
  publisher = {Springer International Publishing},
  address = {Cham},
  pages = {200--216},
  abstract = {The fast development of acceleration architectures and applications has made heterogeneous computing the norm for high-performance computing. The cost of high volume data movement to the accelerators is an important bottleneck both in terms of application performance and developer productivity. Memory management is still a manual task performed tediously by expert programmers. In this paper, we develop a compiler analysis to automate memory management for heterogeneous computing. We propose an optimization framework that casts the problem of detection and removal of redundant data movements into a partial redundancy elimination (PRE) problem and applies the lazy code motion technique to optimize these data movements. We chose OpenMP as the underlying parallel programming model and implemented our optimization framework in the LLVM toolchain. We evaluated it with ten benchmarks and obtained a geometric speedup of 2.3{\$}{\$}{\backslash}times {\$}{\$}, and reduced on average 50{\%} of the total bytes transferred between the host and GPU.},
  isbn = {978-3-030-57675-2}
}

@inproceedings{PLDI2020,
  author = {Porter, Chris and Mururu, Girish and Barua, Prithayan and Pande, Santosh},
  title = {BlankIt Library Debloating: Getting What You Want Instead of Cutting What You Don’t},
  year = {2020},
  isbn = {9781450376136},
  publisher = {Association for Computing Machinery},
  address = {New York, NY, USA},
  url = {https://doi.org/10.1145/3385412.3386017},
  doi = {10.1145/3385412.3386017},
  abstract = {Modern software systems make extensive use of libraries derived from C and C++. Because of the lack of memory safety in these languages, however, the libraries may suffer from vulnerabilities, which can expose the applications to potential attacks. For example, a very large number of return-oriented programming gadgets exist in glibc that allow stitching together semantically valid but malicious Turing-complete and -incomplete programs. While CVEs get discovered and often patched and remedied, such gadgets serve as building blocks of future undiscovered attacks, opening an ever-growing set of possibilities for generating malicious programs. Thus, significant reduction in the quantity and expressiveness (utility) of such gadgets for libraries is an important problem. In this work, we propose a new approach for handling an application’s library functions that focuses on the principle of “getting only what you want.” This is a significant departure from the current approaches that focus on “cutting what is unwanted.” Our approach focuses on activating/deactivating library functions on demand in order to reduce the dynamically linked code surface, so that the possibilities of constructing malicious programs diminishes substantially. The key idea is to load only the set of library functions that will be used at each library call site within the application at runtime. This approach of demand-driven loading relies on an input-aware oracle that predicts a near-exact set of library functions needed at a given call site during the execution. The predicted functions are loaded just in time and unloaded on return. We present a decision-tree based predictor, which acts as an oracle, and an optimized runtime system, which works directly with library binaries like GNU libc and libstdc++. We show that on average, the proposed scheme cuts the exposed code surface of libraries by 97.2%, reduces ROP gadgets present in linked libraries by 97.9%, achieves a prediction accuracy in most cases of at least 97%, and adds a runtime overhead of 18% on all libraries (16% for glibc, 2% for others) across all benchmarks of SPEC 2006. Further, we demonstrate BlankIt on two real-world applications, sshd and nginx, with a high amount of debloating and low overheads.},
  booktitle = {Proceedings of the 41st ACM SIGPLAN Conference on Programming Language Design and Implementation},
  pages = {164–180},
  numpages = {17},
  keywords = {program security, software debloating},
  location = {London, UK},
  series = {PLDI 2020}
}

@inproceedings{10.1007/978-3-030-28596-8_1,
  author = {Barua, Prithayan
and Shirako, Jun
and Tsang, Whitney
and Paudel, Jeeva
and Chen, Wang
and Sarkar, Vivek},
  editor = {Fan, Xing
and de Supinski, Bronis R.
and Sinnen, Oliver
and Giacaman, Nasser},
  title = {OMPSan: Static Verification of OpenMP's Data Mapping Constructs [Best Paper Award]},
  booktitle = {OpenMP: Conquering the Full Hardware Spectrum},
  year = {2019},
  publisher = {Springer International Publishing},
  address = {Cham},
  pages = {3--18},
  abstract = {OpenMP offers directives for offloading computations from CPU hosts to accelerator devices such as GPUs. A key underlying challenge is in efficiently managing the movement of data across the host and the accelerator. User experiences have shown that memory management in OpenMP programs with offloading capabilities is non-trivial and error-prone.},
  isbn = {978-3-030-28596-8}
}

@inproceedings{8735529,
  author = {N. {Srivastava} and H. {Rong} and P. {Barua} and G. {Feng} and H. {Cao} and Z. {Zhang} and D. {Albonesi} and V. {Sarkar} and W. {Chen} and P. {Petersen} and G. {Lowney} and A. {Herr} and C. {Hughes} and T. {Mattson} and P. {Dubey}},
  booktitle = {2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (FCCM)},
  title = {T2S-Tensor: Productively Generating High-Performance Spatial Hardware for Dense Tensor Computations},
  year = {2019},
  volume = {},
  number = {},
  pages = {181-189},
  abstract = {We present a language and compilation framework for productively generating high-performance systolic arrays for dense tensor kernels on spatial architectures, including FPGAs and CGRAs. It decouples a functional specification from a spatial mapping, allowing programmers to quickly explore various spatial optimizations for the same function. The actual implementation of these optimizations is left to a compiler. Thus, productivity and performance are achieved at the same time. We used this framework to implement several important dense tensor kernels. We implemented dense matrix multiply for an Arria-10 FPGA and a research CGRA, achieving 88% and 92% of the performance of manually written, and highly optimized expert (ninja") implementations in just 3% of their engineering time. Three other tensor kernels, including MTTKRP, TTM and TTMc, were also implemented with high performance and low design effort, and for the first time on spatial architectures."},
  keywords = {Optimization;Computer architecture;Kernel;Field programmable gate arrays;Programming;Hardware;Compiler;Domain Specific Language;High Level Synthesis;Spatial Computing;FPGA;CGRA},
  doi = {10.1109/FCCM.2019.00033},
  issn = {2576-2621},
  month = {April}
}

@inproceedings{Barua:2018:CTC:3243176.3243196,
  author = {Barua, Prithayan and Shirako, Jun and Sarkar, Vivek},
  title = {Cost-driven Thread Coarsening for GPU Kernels},
  booktitle = {Proceedings of the 27th International Conference on Parallel Architectures and Compilation Techniques},
  series = {PACT '18},
  year = {2018},
  isbn = {978-1-4503-5986-3},
  location = {Limassol, Cyprus},
  pages = {32:1--32:14},
  articleno = {32},
  numpages = {14},
  url = {http://doi.acm.org/10.1145/3243176.3243196},
  doi = {10.1145/3243176.3243196},
  acmid = {3243196},
  publisher = {ACM},
  address = {New York, NY, USA},
  abstract = {Directive-based programming models like OpenACC provide a higher level abstraction and low overhead approach of porting existing applications to GPGPUs and other heterogeneous HPC hardware. Such programming models increase the design space exploration possible at the compiler level to exploit specific features of different architectures. We observed that traditional applications designed for latency optimized out-of-order pipelined CPUs do not exploit the throughput optimized in-order pipelined GPU architecture efficiently. In this paper we develop a model to estimate the memory throughput of a given application. Then we use the loop interleave transformation to improve the memory bandwidth utilization of a given kernel.

    We developed a heuristic to estimate the optimal loop interleave factor, and implemented it in the OpenARC compiler for OpenACC. We evaluated our approach on over 216 kernels to achieve a Geo-mean speedup of 1.32×.

      Our compiler optimization aims to provide the right balance between performance, portability and productivity.}
}

@misc{1902.06570,
  author = {Girish Mururu and Chris Porter and Prithayan Barua and Santosh Pande},
  title = {Binary Debloating for Security via Demand Driven Loading},
  year = {2019},
  eprint = {arXiv:1902.06570},
  abstract = {Modern software systems heavily use C/C++ based libraries. Because of the weak memory model of C/C++, libraries may suffer from vulnerabilities which can expose the applications to potential attacks. For example, a very large number of return oriented programming gadgets exist in glibc that allow stitching together semantically valid but malicious Turing-complete programs. In spite of significant advances in attack detection and mitigation, full defense is unrealistic against an ever-growing set of possibilities for generating such malicious programs. 
  In this work, we create a defense mechanism by debloating libraries to reduce the dynamic functions linked so that the possibilities of constructing malicious programs diminishes significantly. The key idea is to locate each library call site within an application, and in each case to load only the set of library functions that will be used at that call site. This approach of demand-driven loading relies on an input-aware oracle that predicts a near-exact set of library functions needed at a given call site during the execution. The predicted functions are loaded just in time, and the complete call chain (of function bodies) inside the library is purged after returning from the library call back into the application. We present a decision-tree based predictor, which acts as an oracle, and an optimized runtime system, which works directly with library binaries like GNU libc and libstdc++. We show that on average, the proposed scheme cuts the exposed code surface of libraries by 97.2%, reduces ROP gadgets present in linked libraries by 97.9%, achieves a prediction accuracy in most cases of at least 97%, and adds a small runtime overhead of 18% on all libraries (16% for glibc, 2% for others) across all benchmarks of SPEC 2006, suggesting this scheme is practical.}
}

@inproceedings{10.1007/978-3-642-13365-7_9,
  author = {Giri, Debasis
    and Barua, Prithayan
    and Srivastava, P. D.
    and Jana, Biswapati},
  editor = {Bandyopadhyay, Samir Kumar
    and Adi, Wael
    and Kim, Tai-hoon
    and Xiao, Yang},
  title = {A Cryptosystem for Encryption and Decryption of Long Confidential Messages},
  booktitle = {Information Security and Assurance},
  year = {2010},
  publisher = {Springer Berlin Heidelberg},
  address = {Berlin, Heidelberg},
  pages = {86--96},
  abstract = {In this paper, we propose a cryptosystem which can encrypt and decrypt long (text) messages in efficient manner. The proposed cryptosystem is a combination of symmetric-key and asymmetric-key cryptography, where asymmetric-key cryptography is used to transmit the secret key to an intended receiver and the sender/receiver encrypts/decrypts messages using that secret key. In 2002, Hwang et al. proposed a scheme for encrypting long messages. The main drawback of their scheme is that it requires more computational overhead. Our proposed scheme is more efficient from the computational point of view compared to that of their scheme. Our scheme is a block cipher, long messages are broken into fixed length plaintext blocks for encryption. It supports parallel computation, since encryption/decryption of all the blocks of plaintext/plaintext are independent and thus can be carried out simultaneously. In addition, our scheme retains the same security level as their scheme.},
  isbn = {978-3-642-13365-7}
}