2018
Campbell, Roy H: Assured Cloud Computing University Center of Excellence (ACC UCOE). Technical Report, 2018. URL: http://www.dtic.mil/dtic/tr/fulltext/u2/1045350.pdf

Topics covered: security and isolation in cloud environments; cyber infrastructure security; design of algorithms and techniques for real-time assuredness in cloud computing; map-reduce task assignment with data locality constraints; trustworthiness estimation for workflow completion; application-aware cloud network resource allocation.
2017
Noghabi, Shadi A; Paramasivam, Kartik; Pan, Yi; Ramesh, Navina; Bringhurst, Jon; Gupta, Indranil; Campbell, Roy H: Samza: Stateful Scalable Stream Processing at LinkedIn. Miscellaneous, Proceedings of the VLDB Endowment, 10(12), pp. 1634-1645, VLDB Endowment, 2017.

Distributed stream processing systems need to support stateful processing, recover quickly from failures to resume such processing, and reprocess an entire data stream quickly. We present Apache Samza, a distributed system for stateful and fault-tolerant stream processing. Samza utilizes a partitioned local state along with a low-overhead background changelog mechanism, allowing it to scale to massive state sizes (hundreds of TB) per application. Recovery from failures is sped up by re-scheduling based on host affinity. In addition to processing infinite streams of events, Samza supports processing a finite dataset as a stream, from either a streaming source (e.g., Kafka), a database snapshot (e.g., Databus), or a file system (e.g., HDFS), without having to change the application code (unlike the popular Lambda-based architectures, which necessitate maintaining separate code bases for batch and stream processing). Samza is currently in use at LinkedIn by hundreds of production applications with more than 10,000 containers, and is an open-source Apache project adopted by many top-tier companies (e.g., LinkedIn, Uber, Netflix, and TripAdvisor). Our experiments show that Samza: a) handles state efficiently, improving latency and throughput by more than 100× compared to using remote storage; b) provides recovery time independent of state size; c) scales performance linearly with the number of containers; and d) supports reprocessing of the data stream quickly and with minimal interference on real-time traffic.
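The local-state-plus-changelog idea is easy to see in miniature. The sketch below is illustrative only (Samza itself is written in Java, and its changelog is a compacted Kafka topic; here a plain in-memory list stands in for it):

    # Minimal sketch of partitioned local state with a changelog,
    # assuming an in-memory list in place of a Kafka-backed changelog.
    class StatefulTask:
        def __init__(self, partition_id, changelog):
            self.partition_id = partition_id   # each task owns one partition's state
            self.store = {}                    # local KV store (RocksDB in Samza)
            self.changelog = changelog         # durable log used for recovery

        def process(self, key, value):
            count = self.store.get(key, 0) + value
            self.store[key] = count
            # every local write is also appended to the changelog, so a
            # restarted task can rebuild its store without remote lookups
            self.changelog.append((self.partition_id, key, count))

        def restore(self):
            # replay the changelog to rebuild local state after a failure
            self.store = {}
            for pid, key, count in self.changelog:
                if pid == self.partition_id:
                    self.store[key] = count

    changelog = []
    task = StatefulTask(0, changelog)
    for k, v in [("a", 1), ("b", 2), ("a", 3)]:
        task.process(k, v)
    task.restore()
    assert task.store == {"a": 4, "b": 2}   # state survives a restart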
Palmer, Imani; Gelfand, Boris; Campbell, Roy: Exploring Digital Evidence with Graph Theory. Miscellaneous, pp. 197-206, 2017. URL: https://commons.erau.edu/adfsl/2017/papers/9

The analysis phase of the digital forensic process is the most complex, and it grows more complicated as the size and ubiquity of digital devices increase. There are many tools aimed at assisting the investigator in the analysis process; however, they do not address the growing challenges. In this paper, we discuss the application of graph theory, the study of mathematical structures that model pairwise relations between objects, to aid the investigation process of digital forensic examiners. We explore how graph theory can be used as a basis for further analysis, and we demonstrate its potential through an implementation in a case study.
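As a rough illustration of the approach (not the paper's implementation), evidence artifacts can be modeled as nodes and observed relations as edges, after which standard graph queries surface linked activity. The artifact names below are hypothetical, and the sketch assumes the networkx package:

    # Illustrative sketch: evidence artifacts as a relation graph.
    import networkx as nx

    g = nx.Graph()
    # hypothetical artifacts recovered from a disk image
    g.add_edge("user_account:alice", "file:report.docx", relation="owner")
    g.add_edge("file:report.docx", "usb_device:SN1234", relation="copied_to")
    g.add_edge("usb_device:SN1234", "host:workstation-7", relation="mounted_on")
    g.add_edge("user_account:bob", "host:workstation-7", relation="logged_in")

    # connected components group artifacts linked by any chain of relations
    for component in nx.connected_components(g):
        print(sorted(component))

    # a shortest path suggests how two artifacts may be connected
    print(nx.shortest_path(g, "user_account:alice", "user_account:bob"))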
Giulio, Carlo Di; Sprabery, Read; Kamhoua, Charles; Kwiat, Kevin; Campbell, Roy H; Bashir, Masooda N: Cloud Security Certifications: A Comparison to Improve Cloud Service Provider Security. Miscellaneous, IEEE, 2017.
Sprabery, Read; Evchenko, Konstantin; Raj, Abhilash; Bobba, Rakesh B; Mohan, Sibin; Campbell, Roy H: A Novel Scheduling Framework Leveraging Hardware Cache Partitioning for Cache-Side-Channel Elimination in Clouds. Journal Article, arXiv preprint arXiv:1708.09538, 2017.

While many isolation mechanisms are available to cloud service providers, including virtual machines and containers, the problem of side channels grows in importance as a remaining security vulnerability, particularly in the presence of shared caches and multicore processors. In this paper we present a hardware-software mechanism that improves the isolation of cloud processes in the presence of shared caches on multicore chips. Combining the Intel CAT architecture, which enables cache partitioning on the fly, with novel scheduling techniques and state-cleansing mechanisms, we enable cache-side-channel-free computing for Linux-based containers and virtual machines, in particular those managed by KVM. We present a preliminary evaluation of our system using a CPU-bound workload. Our system allows Simultaneous Multithreading (SMT) to remain enabled and does not require application-level changes.
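A greatly simplified sketch of the scheduling idea follows. It assumes class-of-service slots behave like Intel CAT COS IDs, and apply_cos() and flush_state() are hypothetical stand-ins for the kernel and MSR operations the paper builds on:

    # Toy scheduler: one cache partition per tenant, cleansed on handover.
    import itertools

    COS_SLOTS = itertools.cycle([1, 2, 3])   # cache partitions usable by tenants
    assignment = {}                          # tenant -> COS currently held

    def apply_cos(vm, cos):                  # hypothetical: program CAT for this vCPU
        print(f"pin {vm} to COS {cos}")

    def flush_state(cos):                    # hypothetical: cleanse shared state
        print(f"cleanse partition {cos}")

    def schedule(vm, tenant):
        # never let two tenants share a cache partition at the same time, and
        # cleanse a partition whenever it moves between security domains
        if tenant not in assignment:
            cos = next(COS_SLOTS)
            for t, c in list(assignment.items()):
                if c == cos:
                    del assignment[t]        # evict the previous holder
            flush_state(cos)
            assignment[tenant] = cos
        apply_cos(vm, assignment[tenant])

    schedule("vm-a1", "tenant-a")
    schedule("vm-b1", "tenant-b")
    schedule("vm-a2", "tenant-a")            # reuses tenant-a's partition, no cleanse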
Ramachandran, Prajit; Paine, Tom Le; Khorrami, Pooya; Babaeizadeh, Mohammad; Zhang, Yang; Chang, Shiyu; Hasegawa-Johnson, Mark A; Campbell, Roy H; Huang, Thomas S: Fast Generation for Convolutional Autoregressive Models. Journal Article, arXiv preprint arXiv:1704.06001, 2017.

Convolutional autoregressive models have recently demonstrated state-of-the-art performance on a number of generation tasks. While fast, parallel training methods have been crucial for their success, generation is typically implemented in a naïve fashion where redundant computations are unnecessarily repeated. This results in slow generation, making such models infeasible for production environments. In this work, we describe a method to speed up generation in convolutional autoregressive models. The key idea is to cache hidden states to avoid redundant computation. We apply our fast generation method to the Wavenet and PixelCNN++ models and achieve up to 21× and 183× speedups, respectively.
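The caching idea can be shown for a single dilated causal convolution (illustrative only; the paper applies it layer by layer to full Wavenet and PixelCNN++ stacks). The naive path recomputes over the whole history at every step, while the cached path does constant work per sample:

    # Caching hidden states for a dilated causal convolution (kernel size 2).
    import numpy as np

    w = np.array([0.5, 0.25])      # convolution kernel
    dilation = 2

    def naive_step(x):
        # recompute the convolution over the entire history every step
        t = len(x) - 1
        past = x[t - dilation] if t - dilation >= 0 else 0.0
        return w[0] * past + w[1] * x[t]

    class CachedConv:
        def __init__(self, dilation):
            self.buf = [0.0] * dilation    # ring buffer of past inputs
            self.i = 0

        def step(self, x_t):
            # O(1) per step: the needed past value is read from the cache
            past = self.buf[self.i]
            self.buf[self.i] = x_t
            self.i = (self.i + 1) % len(self.buf)
            return w[0] * past + w[1] * x_t

    xs, cached = [], CachedConv(dilation)
    for x_t in [1.0, 2.0, 3.0, 4.0]:
        xs.append(x_t)
        assert abs(naive_step(xs) - cached.step(x_t)) < 1e-12   # same outputs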
Hashemi, Sayed Hadi; Faghri, Faraz; Campbell, Roy H: Decentralized User-Centric Access Control using PubSub over Blockchain. Conference, arXiv preprint arXiv:1710.00110, 2017.

We present a mechanism that puts users at the center of control and empowers them to dictate access to their collections of data. Revisiting the fundamental mechanisms in security for providing protection, our solution uses capabilities, access lists, and access rights, following well-understood formal notions for reasoning about access. This contribution presents a practical, correct, auditable, transparent, distributed, and decentralized mechanism that is well matched to emerging environments including the Internet of Things, smart cities, precision medicine, and autonomous cars. It is based on well-tested principles and practices used in distributed authorization, cryptocurrencies, and scalable computing.
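A minimal sketch of the capability notion at the heart of such designs (assumptions: HMAC-signed tokens stand in for capability records, and the subject, resource, and key names are hypothetical; in the paper, grants and revocations are instead published through a blockchain-backed pub/sub layer):

    # Capability tokens: unforgeable, checkable grants of specific rights.
    import hmac, hashlib

    OWNER_KEY = b"owner-secret"    # held by the data owner

    def grant(subject, resource, rights):
        msg = f"{subject}|{resource}|{rights}".encode()
        tag = hmac.new(OWNER_KEY, msg, hashlib.sha256).hexdigest()
        return (subject, resource, rights, tag)

    def check(cap, subject, resource, right):
        s, r, rights, tag = cap
        msg = f"{s}|{r}|{rights}".encode()
        expect = hmac.new(OWNER_KEY, msg, hashlib.sha256).hexdigest()
        # the tag proves the owner issued exactly this grant
        return (hmac.compare_digest(tag, expect)
                and s == subject and r == resource
                and right in rights.split(","))

    cap = grant("clinic-42", "sensor/heart-rate", "read")
    assert check(cap, "clinic-42", "sensor/heart-rate", "read")
    assert not check(cap, "clinic-42", "sensor/heart-rate", "write")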
Faghri, Faraz; Hashemi, Sayed Hadi; Babaeizadeh, Mohammad; Nalls, Mike A; Sinha, Saurabh; Campbell, Roy H: Toward Scalable Machine Learning and Data Mining: the Bioinformatics Case. Journal Article, arXiv preprint arXiv:1710.00112, 2017.

In an effort to overcome the data deluge in computational biology and bioinformatics and to facilitate bioinformatics research in the era of big data, we identify some of the most influential algorithms that have been widely used in the bioinformatics community. These top data mining and machine learning algorithms cover classification, clustering, regression, graphical-model-based learning, and dimensionality reduction. The goal of this study is to guide the focus of scalable-computing experts in the endeavor of applying new storage and scalable-computation designs to the bioinformatics algorithms that merit their attention most, following the engineering maxim of "optimize the common case".
Blauwendraat, Cornelis; Faghri, Faraz; Geiger, Joshua T; Nalls, Mike A; Nicolas, Aude; Abramzon, Yevgeniya; Murphy, Natalie A; Gibbs, J Raphael; Ryten, Mina; Ferrari, Raffaele; Houlden, Henry; Pihlstrom, Lasse; Williams, Julie; Morris, Huw R; Lubbe, Steven; Hernandez, Dena G; Mok, Kin Y; Bras, Jose; Guerreiro, Rita; Campbell, Roy H; Traynor, Bryan J; Chia, Ruth; Simón-Sánchez, Javier; COURAGE-PD Consortium; Robak, Laurie; Shulman, Joshua; Rogaeva, Ekaterina; Hardy, John A; Singleton, Andrew B; Scholz, Sonja W: NeuroChip, an updated version of the NeuroX genotyping platform to rapidly screen for variants associated with neurological diseases. Journal Article, Neurobiology of Aging, Elsevier, 2017.

Genetics has proven to be a powerful approach in neurodegenerative disease research, resulting in the identification of numerous causal and risk variants. Previously, we introduced the NeuroX Illumina genotyping array, a fast and efficient genotyping platform designed for the investigation of genetic variation in neurodegenerative diseases. Here, we present its updated version, named NeuroChip. The NeuroChip is a low-cost, custom-designed array containing a tagging-variant backbone of about 306,670 variants, complemented with manually curated custom content comprising 179,467 variants implicated in diverse neurological diseases, including Alzheimer's disease, Parkinson's disease, Lewy body dementia, amyotrophic lateral sclerosis, frontotemporal dementia, progressive supranuclear palsy, corticobasal degeneration, and multiple system atrophy. The tagging backbone was chosen because of its low cost and good genome-wide resolution; the custom content can be combined with other backbones, such as population or drug-development arrays. Using the NeuroChip, we can accurately identify rare variants and impute over 5.3 million common SNPs from the latest release of the Haplotype Reference Consortium. In summary, we describe the design and usage of the NeuroChip array and show its capability for detecting rare pathogenic variants in numerous neurodegenerative diseases. The NeuroChip has more comprehensive and improved content, which makes it a reliable, high-throughput, cost-effective screening tool for genetic research and molecular diagnostics in neurodegenerative diseases.
Babaeizadeh, Mohammad; Finn, Chelsea; Erhan, Dumitru; Campbell, Roy H; Levine, Sergey: Stochastic Variational Video Prediction. Conference, arXiv preprint arXiv:1710.11252, 2017.

Predicting the future in real-world settings, particularly from raw sensory observations such as images, is exceptionally challenging. Real-world events can be stochastic and unpredictable, and the high dimensionality and complexity of natural images require the predictive model to build an intricate understanding of the natural world. Many existing methods tackle this problem by making simplifying assumptions about the environment. One common assumption is that the outcome is deterministic and there is only one plausible future; this can lead to low-quality predictions in real-world settings with stochastic dynamics. In this paper, we develop a stochastic variational video prediction (SV2P) method that predicts a different possible future for each sample of its latent variables. To the best of our knowledge, our model is the first to provide effective stochastic multi-frame prediction for real-world videos. We demonstrate the capability of the proposed method in predicting detailed future frames of videos on multiple real-world datasets, both action-free and action-conditioned. We find that our proposed method produces substantially improved video predictions when compared to the same model without stochasticity, and to other stochastic video prediction methods. Our SV2P implementation will be open-sourced upon publication.
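The stochastic prediction step reduces to sampling the latent variable before decoding. In this toy sketch, decode() is a hypothetical stand-in for the learned video decoder, and the Gaussian parameters would come from the learned inference network rather than being fixed:

    # Toy view of stochastic prediction: each latent sample is a different future.
    import numpy as np

    rng = np.random.default_rng(0)
    mu, sigma = np.zeros(8), np.ones(8)        # latent posterior parameters

    def decode(z, context_frames):
        # hypothetical decoder: any deterministic function of z and context
        return context_frames[-1] + z.sum()     # placeholder "next frame"

    context = [0.0]
    for _ in range(3):
        z = mu + sigma * rng.standard_normal(8)   # reparameterized sample
        print(decode(z, context))                 # three distinct plausible futures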
2016
Cai, Chris X; Le, Franck; Sun, Xin; Xie, Geoffrey; Jamjoom, Hani; Campbell, Roy H: CRONets: Cloud-Routed Overlay Networks. Conference, 36th IEEE International Conference on Distributed Computing Systems (ICDCS 2016), IEEE, Nara, Japan, 2016. Tags: networks, overlays, cloud routing.

Overlay networking and ISP-assisted tunneling are effective solutions to overcome problematic BGP routes and bypass troublesome autonomous systems. Despite their demonstrated effectiveness, overlay support is not broadly available. In this paper, we propose Cloud-Routed Overlay Networks (CRONets), whereby users can readily build their own overlays using nodes from global, well-provisioned cloud providers like IBM SoftLayer or Amazon EC2. While previous studies have demonstrated the benefits of overlay networks on the high-speed experimental Internet2 backbone, we are the first to evaluate the improvements in a realistic cloud setting. We conduct a large-scale experiment in which we observe 6,600 Internet paths. The results show that CRONets improve the throughput for 78% of the default Internet paths, with a median improvement factor of 3.26 times, at a tenth of the cost of leasing private lines of comparable performance. We also performed a longitudinal measurement and demonstrate that the performance gains are consistent over time, with only a small number of overlay nodes needing to be deployed. However, given the size and dynamic nature of the Internet routing system (e.g., due to congestion and failures), selecting the proper path is still a challenging problem. To address it, we propose a novel solution based on the newly introduced MPTCP extensions. Our experiments show that MPTCP can achieve the maximum observed throughput across the different overlay paths. Yet, for some instances, MPTCP performs up to 38% lower than expected, indicating room to improve the MPTCP congestion-control algorithms.
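The path-selection problem can be pictured with a toy relay chooser (illustrative only; measure() is a hypothetical throughput probe, and the relay names and numbers are made up):

    # Toy overlay path selection: probe each cloud relay, keep the best path.
    RELAYS = ["dal05.softlayer", "fra02.softlayer", "ec2.us-east-1"]

    def measure(path):
        # hypothetical probe returning observed throughput in Mb/s
        sample = {"direct": 12.0, "dal05.softlayer": 41.0,
                  "fra02.softlayer": 29.5, "ec2.us-east-1": 18.2}
        return sample[path]

    paths = ["direct"] + RELAYS
    best = max(paths, key=measure)
    print("use", best, "at", measure(best), "Mb/s")  # relay wins if it beats direct

MPTCP sidesteps the need to commit to a single choice by striping one connection across several overlay paths at once, which is why the paper proposes it for the dynamic case.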
Noghabi, Shadi A; Subramanian, Sriram; Narayanan, Priyesh; Narayanan, Sivabalan; Holla, Gopalakrishna; Zadeh, Mammad; Li, Tianwei; Gupta, Indranil; Campbell, Roy H: Ambry: LinkedIn's Scalable Geo-distributed Object Store. Conference, ACM SIGMOD/PODS Conference, ACM, San Francisco, USA, 2016. Tags: geo-distributed object store.

The infrastructure of a global social network has to continually serve millions of users' massive media objects, such as photos, videos, and audio. Creation and reading of this data must be supported with low latency and high throughput, across multiple geo-distributed datacenters, in a scalable and load-balanced way. Existing file systems and blob stores face a challenge when serving such large objects. We present Ambry, a new blob store that caters to this demand. Ambry is a production-quality system for storing large immutable data (called blobs). Ambry leverages techniques such as a decentralized design, logical blob grouping, asynchronous replication, rebalancing mechanisms, and OS caching. Ambry has been running in a production environment for 24 months, serving up to 10K requests per second across more than 400 million users. Our experimental evaluation reveals that Ambry offers high efficiency (it utilizes up to 88% of the network bandwidth), responsiveness with low latency, and load balancing (it improves the imbalance of request rates and disk usage across the cluster by 8x-10x).
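Logical blob grouping can be sketched as a mapping from blobs to partitions, where each partition carries a fixed replica set (a hash-based toy, not Ambry's code; production Ambry instead picks a writable partition at put() time and embeds it in the returned blob ID):

    # Toy blob placement: many blobs per partition keeps metadata small,
    # and replication/rebalancing operate on partitions, not single blobs.
    import hashlib

    PARTITIONS = 8
    REPLICAS = {p: [f"dc{d}-disk{p}" for d in range(3)]   # 3 datacenters
                for p in range(PARTITIONS)}

    def place(blob_id):
        digest = hashlib.sha1(blob_id.encode()).hexdigest()
        p = int(digest, 16) % PARTITIONS
        return p, REPLICAS[p]

    print(place("photo-123.jpg"))   # -> (partition, its geo-distributed replicas)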
Hashemi, Sayed Hadi; Faghri, Faraz; Rausch, Paul; Campbell, Roy H: World of Empowered IoT Users. Conference, The 1st IEEE International Conference on Internet-of-Things Design and Implementation, IEEE, Berlin, Germany, 2016. Tags: IoT, privacy, blockchains.

In a world deploying an Internet of Things, sensors and actuators are owned, accessed, and activated by a plethora of individuals and organizations. Access to the data produced by this world can be both beneficial and detrimental to society. This data potentially represents the activities of millions of individuals and their possessions, collected by billions of things, and aggregations of this data can be analyzed through the Internet and clouds. This raises privacy, security, moral, and ethical challenges whose solutions will require flexible protection mechanisms. How do we acquire and distribute data at IoT world scale while retaining the rights of individuals and organizations to protect, use, and share their data? Clearly, a well-defined mechanism and control needs to regulate access to the data and its aggregations. Our paper describes a user-centric, multi-level, multiple-granularity mechanism to share the data from these devices with people and organizations. Revisiting the fundamental mechanisms in security for providing protection, our solution uses capabilities, access lists, and access rights, following well-understood formal notions for reasoning about access. Our contribution is to describe an auditable, transparent, distributed, decentralized, publication-subscription-based, robust mechanism, and the automation of these ideas in the IoT realm, that is well matched to the current generation of clouds. It is based on well-tested principles and practices used in cryptocurrencies exploiting blockchains of transactions. The scheme puts users (including organizational entities) at the center of control over access to their collections of sensory data. In our paper, we describe a deployment of these ideas for medical health devices, smart cities, and transportation.
Pundir, Mayank; Kumar, Manoj; Leslie, Luke M; Gupta, Indranil; Campbell, Roy H: Supporting On-demand Elasticity in Distributed Graph Processing. Conference, IEEE International Conference on Cloud Engineering (IC2E), 2016. Tags: graph processing, elasticity.

While distributed graph processing engines have become popular for processing large graphs, these engines are typically configured with a static set of servers in the cluster. In other words, they lack the flexibility to scale out or scale in the number of servers when requested to do so by the user. In this paper, we propose the first techniques to make distributed graph processing truly elastic. While supporting on-demand scale-out/in operations, we meet three goals: i) perform scale-out/in without interrupting the graph computation, ii) minimize the background network overhead involved in the scale-out/in, and iii) mitigate stragglers by maintaining load balance across servers. We present and analyze two techniques, called Contiguous Vertex Repartitioning (CVR) and Ring-based Vertex Repartitioning (RVR), to address these goals. We implement our techniques in the LFGraph distributed graph processing system and incorporate several systems optimizations. Experiments performed with multiple graph benchmark applications on a real graph indicate that our techniques perform within 9% and 21% of the optimum for scale-out and scale-in operations, respectively.
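The CVR idea can be sketched in a few lines: keep vertices in one global order, give each server a contiguous slice, and recompute the slice boundaries on a scale-out (illustrative only, not LFGraph code):

    # Contiguous Vertex Repartitioning, in miniature.
    def partition(num_vertices, num_servers):
        size, extra = divmod(num_vertices, num_servers)
        bounds, start = [], 0
        for s in range(num_servers):
            end = start + size + (1 if s < extra else 0)
            bounds.append(range(start, end))   # server s owns vertices [start, end)
            start = end
        return bounds

    def owner(bounds, v):
        return next(s for s, r in enumerate(bounds) if v in r)

    before = partition(100, 4)   # four servers, 25 contiguous vertices each
    after = partition(100, 5)    # scale-out to five servers, 20 each
    moved = sum(owner(before, v) != owner(after, v) for v in range(100))
    print(moved, "of 100 vertices change servers")
    # slices stay contiguous and balanced, so only boundary vertices ship
    # between servers while the computation keeps running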
Cai, Chris X; Saeed, Shayan; Gupta, Indranil; Campbell, Roy H; Le, Franck: Phurti: Application and Network-Aware Flow Scheduling for Multi-Tenant MapReduce Clusters. Conference, IEEE International Conference on Cloud Engineering (IC2E), Berlin, Germany, 2016. Tags: network-aware flow scheduling.
Hashemi, Sayed Hadi; Noghabi, Shadi A; Bellessa, John; Campbell, Roy H: Toward Fabric: A Middleware Implementing High-level Description Languages on a Fabric-like Network. Conference, ANCS '16: Proceedings of the 2016 Symposium on Architectures for Networking and Communications Systems, pp. 117-118, ACM, New York, USA, 2016, ISBN: 978-1-4503-4183-7. DOI: 10.1145/2881025.2889487. Tags: software-defined networks, routing.

Many in the networking community believe that Software-Defined Networking (SDN), in which entire networks are managed centrally, has the potential to revolutionize the field. However, SDN faces several challenges that have prevented its widespread adoption. Current SDN technologies, such as OpenFlow, provide powerful and flexible APIs but can be unreasonably complex for implementing nontrivial network control logic. The generality offered by these low-level abstractions imposes no structure on the network, requiring programmers to herd switches themselves with little guidance. Many researchers argue that SDNs must adopt more structured models, such as Fabric, with an intelligent edge and a fast but simple label-switched core. Our work draws heavily from these ideas. To that end, we propose ToF, a middleware architecture for implementing policies and behaviors from high-level network descriptions on top of a Fabric-like network. We have implemented a prototype using a combination of widely used technologies, such as MPLS, and our own proposed technologies. Based on our results, we reach near-linear scalability with respect to the number of addresses routed over the network, while introducing minimal performance overhead and requiring no changes to packet structure.
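The edge/core split the paper argues for can be pictured with a toy label-switched forwarder (hypothetical labels, switches, and ports; real deployments use MPLS labels programmed by the controller):

    # Toy Fabric-style forwarding: smart edge, label-only core.
    EDGE_LABELS = {"10.0.1.0/24": 101, "10.0.2.0/24": 102}   # set by controller
    CORE_TABLE = {                                            # per-switch label maps
        "s1": {101: "port2", 102: "port3"},
        "s2": {101: "port1", 102: "port1"},
    }

    def edge_ingress(dst_prefix):
        return EDGE_LABELS[dst_prefix]     # policy intelligence lives at the edge

    def core_forward(switch, label):
        return CORE_TABLE[switch][label]   # core does one exact-match lookup

    label = edge_ingress("10.0.1.0/24")
    print(core_forward("s1", label), core_forward("s2", label))

Because core tables are keyed by label rather than by address, the core's state stays small no matter how many addresses the edge maps, which is the intuition behind the near-linear scalability result.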
Xie, Qiaomin; Pundir, Mayank; Lu, Yi; Abad, Cristina L; Campbell, Roy: Pandas: Robust Locality-Aware Scheduling with Stochastic Delay Optimality. Journal Article, IEEE/ACM Transactions on Networking, PP(99), pp. 1-14, 2016.

Data locality is a fundamental problem for data-parallel applications, where data-processing tasks consume different amounts of time and resources at different locations. The problem is especially prominent under stressed conditions such as hot-spots. While replication based on data popularity relieves hot-spots caused by contention for a single file, hot-spots caused by skewed node popularity, due to contention for files co-located with each other, are more complex and unpredictable, and hence more difficult to deal with. We propose Pandas, a lightweight acceleration engine for data-processing tasks that is robust to changes in load and skew in node popularity. Pandas is a stochastically delay-optimal algorithm. Trace-driven experiments on Hadoop show that Pandas accelerates the data-processing phase of jobs by 11 times with hot-spots and 2.4 times without hot-spots over existing schedulers. When the difference in processing times due to location is large, as in the case of memory locality, the acceleration by Pandas is 22 times.
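As a rough picture of locality-aware assignment (a greedy toy, not the paper's stochastically delay-optimal policy; task and node names are hypothetical):

    # Greedy locality-aware assignment: idle nodes prefer tasks whose
    # input data they already hold, since remote tasks run much slower.
    replicas = {"task1": {"n1", "n2"}, "task2": {"n2"}, "task3": {"n3"}}
    free_nodes = ["n2", "n3", "n1"]

    for node in free_nodes:
        local = [t for t, nodes in replicas.items() if node in nodes]
        chosen = local[0] if local else next(iter(replicas), None)
        if chosen:
            del replicas[chosen]
            print(node, "runs", chosen, "locally" if local else "remotely")

The interesting cases are exactly where this greedy rule breaks down, for example under hot-spots where many tasks want the same node, which is what the delay-optimal formulation in the paper addresses.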
Hashemi, Sayed Hadi; Noghabi, Shadi A; Gropp, William; Campbell, Roy H: Performance Modeling of Distributed Deep Neural Networks. Conference, CoRR abs/1612.00521, 2016.

During the past decade, machine learning has become extremely popular and can be found in many aspects of our everyday lives. Nowadays, with the explosion of data and the rapid growth of computation capacity, Distributed Deep Neural Networks (DDNNs), whose performance can improve linearly with more computation resources, have attracted wide interest. However, there has not been an in-depth study of the performance of these systems and how well they scale. In this paper we analyze CNTK, one of the most commonly used DDNNs, by first building a performance model and then evaluating the system in two settings: a small cluster with all nodes in a single rack connected to a top-of-rack switch, and at large scale on Blue Waters with arbitrary placement of nodes. Our main focus was the scalability of the system with respect to adding more nodes. Based on our results, the system has an excessive initialization overhead, caused by poor I/O utilization, that dominates the whole execution time. Because of this, the system does not scale beyond a few nodes (4 on Blue Waters). Additionally, due to a single-server, multiple-worker design, the server becomes a bottleneck after 16 nodes, limiting the scalability of CNTK.
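The shape of such a performance model can be sketched with made-up coefficients (the paper fits the corresponding terms from measurements of CNTK): a fixed initialization cost, compute that shrinks with more nodes, and a parameter-server term that grows with them.

    # Illustrative analytic model; all constants below are synthetic.
    def epoch_time(n, t_init=120.0, work=600.0, server_cost=2.0):
        compute = work / n              # data-parallel compute shrinks with n
        comm = server_cost * n          # single parameter server serves all workers
        return t_init + compute + comm  # seconds

    for n in (1, 4, 16, 64):
        print(n, "nodes:", round(epoch_time(n), 1), "s")
    # 1 -> 722.0, 4 -> 278.0, 16 -> 189.5, 64 -> 257.4: initialization dominates
    # small runs, and the server term eventually reverses the scaling gains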
2015
Yao, Fangzhou; Chang, Kevin; Campbell, Roy H: Ushio: Analyzing News Media and Public Trends in Twitter. Miscellaneous, 2015. URL: http://www.staffs.ac.uk/personal/engineering_and_technology/eb26/BDSN-2015/
Liu, Weijie; Bobba, Rakesh B; Mohan, Sibin; Campbell, Roy H: Inter-Flow Consistency: A Novel SDN Update Abstraction for Supporting Inter-Flow Constraints. Miscellaneous, pp. 469-478, IEEE, 2015.
Liu, Weijie; Bobba, Rakesh B; Mohan, Sibin; Campbell, Roy H: Inter-Flow Consistency: Novel SDN Update Abstraction for Supporting Inter-Flow Constraints. Miscellaneous, Internet Society, 2015. URL: http://www.internetsociety.org/doc/inter-flow-consistency-novel-sdn-update-abstraction-supporting-inter-flow-constraints
Stephens, Zachary D; Lee, Skylar Y; Faghri, Faraz; Campbell, Roy H; Zhai, Chengxiang; Efron, Miles J; Iyer, Ravishankar; Schatz, Michael C; Sinha, Saurabh; Robinson, Gene E: Big Data: Astronomical or Genomical? Journal Article, PLoS Biology, 13(7): e1002195, 2015. DOI: 10.1371/journal.pbio.1002195.
Pundir, Mayank; Leslie, Luke M; Gupta, Indranil; Campbell, Roy H: Zorro: Zero-Cost Reactive Failure Recovery in Distributed Graph Processing. Inproceedings, SoCC '15: Proceedings of the Sixth ACM Symposium on Cloud Computing, pp. 195-208, ACM, 2015, ISBN: 978-1-4503-3651-2. DOI: 10.1145/2806777.2806934.
Peng, Boyang; Hosseini, Mohammad; Hong, Zhihao; Farivar, Reza; Campbell, Roy: R-Storm: Resource-Aware Scheduling in Storm. Miscellaneous, pp. 149-161, 2015.
Palmer, Imani; Wood, Elaine; Nagy, Stefan; Garcia, Gabriela; Bashir, Masooda; Campbell, Roy: Digital Forensics Education: A Multidisciplinary Curriculum Model. Book Chapter in: James, Joshua I; Breitinger, Frank (Eds.): Digital Forensics and Cyber Crime: 7th International Conference, ICDF2C 2015, Lecture Notes of the Institute for Computer Sciences, Social Informatics and Telecommunications Engineering, vol. 157, pp. 3-15, Springer International Publishing, Seoul, South Korea, 2015, ISBN: 978-3-319-25512-5. DOI: 10.1007/978-3-319-25512-5_1.
Cai, Chris; Le, Franck; Sun, Xin; Xie, Geoffrey: BYOO: Build Your Own Overlay. Technical Report, 2015. URL: http://srg.cs.illinois.edu/wp-content/uploads/2016/04/techReport.pdf
2014
Verma, Abhishek; Cherkasova, Ludmila; Campbell, Roy H Profiling and Evaluating Hardware Choices for MapReduce Environments: an Application-Aware Approach Conference Proc. of the 32nd Intl. Symposium on Computer Performance, Modeling, Measurements, and Evaluation (IFIP WG 7.3 Performance'2014), Turin, Italy, 2014. Abstract | BibTeX | Tags: benchmarking, MapReduce, performance modeling @conference{175, title = {Profiling and Evaluating Hardware Choices for MapReduce Environments: an Application-Aware Approach}, author = {Abhishek Verma and Ludmila Cherkasova and Roy H Campbell}, year = {2014}, date = {2014-10-01}, booktitle = {Proc. of the 32nd Intl. Symposium on Computer Performance, Modeling, Measurements, and Evaluation (IFIP WG 7.3 Performance'2014)}, address = {Turin, Italy}, abstract = {The core business of many companies depends on the timely analysis of large quantities of new data. MapReduce clusters that routinely process petabytes of data represent a new entity in the evolving landscape of clouds and data centers. During the lifetime of a datacenter, old hardware needs to be eventually replaced by new hardware. The hardware selection process needs to be driven by performance objectives of the existing production workloads. In this work, we present a general framework, called Ariel, that automates system administrators' efforts for evaluating different hardware choices and predicting completion times of MapReduce applications for their migration to a Hadoop cluster based on the new hardware. The proposed framework consists of two key components: i) a set of microbenchmarks to profile the MapReduce processing pipeline on a given platform, and ii) a regression-based model that establishes a performance relationship between the source and target platforms. Benchmarking and model derivation can be done using a small test cluster based on new hardware. However, the designed model can be used for predicting the jobs' completion time on a large Hadoop cluster and be applied for its sizing to achieve desirable service level objectives (SLOs). We validate the effectiveness of the proposed approach using a set of twelve realistic MapReduce applications and three different hardware platforms. The evaluation study justifies our design choices and shows that the derived model accurately predicts performance of the test applications. The predicted completion times of eleven applications (out of twelve) are within 10% of the measured completion times on the target platforms.}, keywords = {benchmarking, MapReduce, performance modeling}, pubstate = {published}, tppubtype = {conference} } The core business of many companies depends on the timely analysis of large quantities of new data. MapReduce clusters that routinely process petabytes of data represent a new entity in the evolving landscape of clouds and data centers. During the lifetime of a datacenter, old hardware needs to be eventually replaced by new hardware. The hardware selection process needs to be driven by performance objectives of the existing production workloads. In this work, we present a general framework, called Ariel, that automates system administrators' efforts for evaluating different hardware choices and predicting completion times of MapReduce applications for their migration to a Hadoop cluster based on the new hardware. The proposed framework consists of two key components: i) a set of microbenchmarks to profile the MapReduce processing pipeline on a given platform, and ii) a regression-based model that establishes a performance relationship between the source and target platforms. Benchmarking and model derivation can be done using a small test cluster based on new hardware. However, the designed model can be used for predicting the jobs' completion time on a large Hadoop cluster and be applied for its sizing to achieve desirable service level objectives (SLOs). We validate the effectiveness of the proposed approach using a set of twelve realistic MapReduce applications and three different hardware platforms. The evaluation study justifies our design choices and shows that the derived model accurately predicts performance of the test applications. The predicted completion times of eleven applications (out of twelve) are within 10% of the measured completion times on the target platforms. |
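The Ariel abstract above hinges on one regression step: fit a per-phase linear relationship between durations measured by the same microbenchmarks on the source and target platforms, then map a job's measured source-platform profile to a predicted target-platform completion time. The following is a minimal sketch of that idea; the phase names, array shapes, and function names are illustrative assumptions, not Ariel's actual interface.

```python
# A minimal sketch, assuming we have per-phase durations from matched
# microbenchmark runs on both platforms. Phase set is illustrative.
import numpy as np

PHASES = ["read", "map", "shuffle", "reduce", "write"]

def fit_platform_model(src_runs: np.ndarray, tgt_runs: np.ndarray) -> dict:
    """Fit one linear model per phase: tgt_time ~ a * src_time + b.

    src_runs, tgt_runs: shape (n_benchmarks, len(PHASES)) arrays of phase
    durations measured by the same microbenchmarks on each platform.
    """
    model = {}
    for j, phase in enumerate(PHASES):
        # Least squares with an intercept term for this phase.
        A = np.column_stack([src_runs[:, j], np.ones(len(src_runs))])
        (a, b), *_ = np.linalg.lstsq(A, tgt_runs[:, j], rcond=None)
        model[phase] = (a, b)
    return model

def predict_completion(src_profile: dict, model: dict) -> float:
    """Map a job's per-phase times on the source platform to a predicted
    completion time on the target platform."""
    return sum(a * src_profile[phase] + b for phase, (a, b) in model.items())
```

Per the abstract, such coefficients can be derived on a small test cluster of the new hardware and then reused to predict completion times on, and size, a full production cluster.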
Yao, Fangzhou; Sprabery, Read T; Campbell, Roy H CryptVMI: a Flexible and Encrypted Virtual Machine Introspection in the Cloud Conference Proceedings of the Second International Workshop on Security in Cloud Computing, ACM New York ACM New York, Kyoto, Japan, 2014. @conference{172, title = {CryptVMI: a Flexible and Encrypted Virtual Machine Introspection in the Cloud}, author = {Fangzhou Yao and Read T. Sprabery and Roy H Campbell}, year = {2014}, date = {2014-06-01}, booktitle = {Proceedings of the Second International Workshop on Security in Cloud Computing}, pages = {11-18}, publisher = {ACM New York}, address = {Kyoto, Japan}, organization = {ACM New York}, abstract = {Virtualization has demonstrated its importance in both public and private cloud computing solutions. In such environments, multiple virtual instances run on the same physical machine concurrently. Thus, the isolation in the system is not guaranteed by the physical infrastructure anymore. Reliance on logical isolation makes a system vulnerable to attacks. Thus, Virtual Machine Introspection techniques become essential, since they simplify the process to acquire evidence for further analysis in this complex system. However, Virtual Machine Introspection tools for the cloud are usually written specifically for a single system and do not provide a standard interface to work with other security monitoring systems. Moreover, this technique breaks down the borders of the segregation between multiple tenants, which should be avoided in a public cloud computing environment. In this paper, we focus on building a flexible and encrypted Virtual Machine Introspection system, CryptVMI, to address the above concerns. Our approach maintains a client application on the user end to send queries to the cloud, as well as parse the results returned in a standard form. We also have a handler that cooperates with an introspection application in the cloud infrastructure to process queries and return encrypted results. This work shows our design and implementation of this system, and the benchmark results prove that it does not incur much performance overhead.}, keywords = {}, pubstate = {published}, tppubtype = {conference} } Virtualization has demonstrated its importance in both public and private cloud computing solutions. In such environments, multiple virtual instances run on the same physical machine concurrently. Thus, the isolation in the system is not guaranteed by the physical infrastructure anymore. Reliance on logical isolation makes a system vulnerable to attacks. Thus, Virtual Machine Introspection techniques become essential, since they simplify the process to acquire evidence for further analysis in this complex system. However, Virtual Machine Introspection tools for the cloud are usually written specifically for a single system and do not provide a standard interface to work with other security monitoring systems. Moreover, this technique breaks down the borders of the segregation between multiple tenants, which should be avoided in a public cloud computing environment. In this paper, we focus on building a flexible and encrypted Virtual Machine Introspection system, CryptVMI, to address the above concerns. Our approach maintains a client application on the user end to send queries to the cloud, as well as parse the results returned in a standard form. We also have a handler that cooperates with an introspection application in the cloud infrastructure to process queries and return encrypted results. 
This work shows our design and implementation of this system, and the benchmark results prove that it does not incur much performance overhead. |
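The query/response flow the CryptVMI abstract describes, where a user-side client encrypts introspection queries and a cloud-side handler returns encrypted results, can be sketched as follows. This is a hypothetical illustration, not CryptVMI's actual protocol or wire format: Fernet stands in for whatever cipher and key-provisioning scheme the system uses, and introspect() is a placeholder for the introspection backend.

```python
# Hypothetical sketch of an encrypted VMI query round trip (not CryptVMI's
# real protocol): only ciphertext crosses the cloud boundary.
from cryptography.fernet import Fernet

user_key = Fernet.generate_key()  # assumed provisioned per tenant out of band

def client_build_query(vm_id: str, what: str) -> bytes:
    # The user-side client encrypts the query before it leaves the client.
    return Fernet(user_key).encrypt(f"{vm_id}:{what}".encode())

def handler_process(query: bytes) -> bytes:
    # The cloud-side handler decrypts, dispatches to the introspection
    # application, and encrypts the result under the same per-user key.
    f = Fernet(user_key)
    vm_id, what = f.decrypt(query).decode().split(":", 1)
    return f.encrypt(introspect(vm_id, what).encode())

def introspect(vm_id: str, what: str) -> str:
    # Placeholder for real introspection (e.g., reading guest VM state).
    return f"{what}@{vm_id}: ok"

reply = handler_process(client_build_query("vm-42", "process-list"))
print(Fernet(user_key).decrypt(reply).decode())
```

Because results stay encrypted under a per-user key end to end, one tenant's introspection traffic reveals nothing to other tenants sharing the infrastructure, which is the multi-tenancy concern the abstract raises.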
Huang, Jingwei; Nicol, David M; Campbell, Roy H Denial-of-Service Threat to Hadoop/YARN Clusters with Multi-Tenancy Conference 2014 IEEE Second International Congress on Big Data (BigData Congress 2014), IEEE IEEE, Anchorage, AK, USA, 2014. BibTeX | Tags: @conference{176, title = {Denial-of-Service Threat to Hadoop/YARN Clusters with Multi-Tenancy}, author = {Jingwei Huang and David M. Nicol and Roy H. Campbell}, year = {2014}, date = {2014-06-01}, booktitle = {2014 IEEE Second International Congress on Big Data (BigData Congress 2014)}, publisher = {IEEE}, address = {Anchorage, AK, USA}, organization = {IEEE}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Abad, Cristina L; Lu, Yi; Campbell, Roy H; Roberts, Nathan A Model-Based Namespace Metadata Benchmark for HDFS Conference International Conference on Autonomic Computing, USENIX USENIX, Philadelphia, 2014. BibTeX | Tags: @conference{177, title = {A Model-Based Namespace Metadata Benchmark for HDFS}, author = {Cristina L. Abad and Yi Lu and Roy H. Campbell and Nathan Roberts}, year = {2014}, date = {2014-06-01}, booktitle = {International Conference on Autonomic Computing}, publisher = {USENIX}, address = {Philadelphia}, organization = {USENIX}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Abad, Cristina Big Data Storage Workload Characterization, Modeling and Synthetic Generation PhD Thesis University of Illinois at Urbana-Champaign, 2014. BibTeX | Tags: @phdthesis{589, title = {Big Data Storage Workload Characterization, Modeling and Synthetic Generation}, author = {Cristina Abad}, year = {2014}, date = {2014-05-01}, volume = {Ph.D.}, address = {Urbana, Illinois}, school = {University of Illinois at Urbana-Champaign}, keywords = {}, pubstate = {published}, tppubtype = {phdthesis} } |
Schmidt, Maxie A Computer Algebra Package for Polynomial Sequence Recognition Masters Thesis University of Illinois, 2014. BibTeX | Tags: @mastersthesis{586, title = {A Computer Algebra Package for Polynomial Sequence Recognition}, author = {Maxie Schmidt}, year = {2014}, date = {2014-05-01}, volume = {MS}, address = {Urbana, Illinois}, school = {University of Illinois}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } |
Lang, Anthony A New Portable Digital Forensics Curriculum Masters Thesis University of Illinois, 2014. BibTeX | Tags: @mastersthesis{587, title = {A New Portable Digital Forensics Curriculum}, author = {Anthony Lang}, year = {2014}, date = {2014-05-01}, volume = {MS}, address = {Urbana, Illinois}, school = {University of Illinois}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } |
Chu, Jonathan The Triple Pot and Techniques in Distributed System Call Intrusion Detection Masters Thesis University of Illinois, 2014. BibTeX | Tags: @mastersthesis{588, title = {The Triple Pot and Techniques in Distributed System Call Intrusion Detection}, author = {Jonathan Chu}, year = {2014}, date = {2014-05-01}, volume = {MS}, address = {Urbana, Illinois}, school = {University of Illinois}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } |
Yao, Fangzhou; Campbell, Roy H CouchFS: A High-Performance File System for Large Data Sets Conference IEEE Proceedings of the 3rd International Congress on Big Data 2014, Anchorage, Alaska, 2014. BibTeX | Tags: @conference{174, title = {CouchFS: A High-Performance File System for Large Data Sets}, author = {Fangzhou Yao and Roy H. Campbell}, year = {2014}, date = {2014-01-01}, booktitle = {IEEE Proceedings of the 3rd International Congress on Big Data 2014}, address = {Anchorage, Alaska}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Yao, Fangzhou; Campbell, Roy H CryptVMI: Encrypted Virtual Machine Introspection in the Cloud Conference 2014 IEEE Seventh International Conference on Cloud Computing (Cloud 2014), IEEE IEEE, Anchorage, Alaska, 2014. BibTeX | Tags: @conference{173, title = {CryptVMI: Encrypted Virtual Machine Introspection in the Cloud}, author = {Fangzhou Yao and Roy H Campbell}, year = {2014}, date = {2014-01-01}, booktitle = {2014 IEEE Seventh International Conference on Cloud Computing (Cloud 2014)}, publisher = {IEEE}, address = {Anchorage, Alaska}, organization = {IEEE}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Lamps, Jereme WinWizard: Expanding Xen with a LibVMI Intrusion Detection Tool Proceeding IEEE Sixth International Conference on Cloud Computing, 2014. BibTeX | Tags: @proceedings{804, title = {WinWizard: Expanding Xen with a LibVMI Intrusion Detection Tool}, author = {Jereme Lamps}, editor = {Imani Palmer}, year = {2014}, date = {2014-01-01}, journal = {IEEE Sixth International Conference on Cloud Computing}, keywords = {}, pubstate = {published}, tppubtype = {proceedings} } |
2013 |
Malik, Muhammad Salman; Berthier, Robin; Bobba, Rakesh B; Campbell, Roy H; Sanders, William H Formal Design of Communication Checkers for ICCP using UPPAAL Conference IEEE International Conference on Smart Grid Communications (SmartGridComm), Vancouver, Canada, 2013. BibTeX | Tags: @conference{184, title = {Formal Design of Communication Checkers for ICCP using UPPAAL}, author = {Muhammad Salman Malik and Robin Berthier and Rakesh B. Bobba and Roy H. Campbell and William H. Sanders}, year = {2013}, date = {2013-10-01}, booktitle = {IEEE International Conference on Smart Grid Communications (SmartGridComm)}, pages = {486-491}, address = {Vancouver, Canada}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Abad, Cristina L; Yuan, Mindi; Cai, Chris X; Lu, Yi; Roberts, Nathan; Campbell, Roy H Generating request streams on Big Data using clustered renewal processes Journal Article Performance Evaluation, 70 (10), pp. 704-719, 2013, ISSN: 0166-5316, (Special Issue; Times Cited: 0; Cited References Count: 29). Abstract | BibTeX | Tags: big data, hdfs, popularity, storage, temporal locality, workload generation @article{188, title = {Generating request streams on Big Data using clustered renewal processes}, author = {Abad, Cristina L. and Yuan, Mindi and Cai, Chris X. and Lu, Yi and Roberts, Nathan and Campbell, Roy H.}, issn = {0166-5316}, year = {2013}, date = {2013-10-01}, journal = {Performance Evaluation}, volume = {70}, number = {10}, pages = {704-719}, abstract = {The performance evaluation of large file systems, such as storage and media streaming, motivates scalable generation of representative traces. We focus on two key characteristics of traces, popularity and temporal locality. The common practice of using a system-wide distribution obscures per-object behavior, which is important for system evaluation. We propose a model based on delayed renewal processes which, by sampling interarrival times for each object, accurately reproduces popularity and temporal locality for the trace. A lightweight version reduces the dimension of the model with statistical clustering. It is workload-agnostic and object type-aware, suitable for testing emerging workloads and 'what-if' scenarios. We implemented a synthetic trace generator and validated it using: (1) a Big Data storage (HDFS) workload from Yahoo!, (2) a trace from a feature animation company, and (3) a streaming media workload. Two case studies in caching and replicated distributed storage systems show that our traces produce application-level results similar to the real workload. The trace generator is fast and readily scales to a system of 4.3 million files. It outperforms existing models in terms of accurately reproducing the characteristics of the real trace. (C) 2013 Elsevier B.V. All rights reserved.}, note = {Special Issue; Times Cited: 0; Cited References Count: 29}, keywords = {big data, hdfs, popularity, storage, temporal locality, workload generation}, pubstate = {published}, tppubtype = {article} } The performance evaluation of large file systems, such as storage and media streaming, motivates scalable generation of representative traces. We focus on two key characteristics of traces, popularity and temporal locality. The common practice of using a system-wide distribution obscures per-object behavior, which is important for system evaluation. We propose a model based on delayed renewal processes which, by sampling interarrival times for each object, accurately reproduces popularity and temporal locality for the trace. A lightweight version reduces the dimension of the model with statistical clustering. It is workload-agnostic and object type-aware, suitable for testing emerging workloads and 'what-if' scenarios. We implemented a synthetic trace generator and validated it using: (1) a Big Data storage (HDFS) workload from Yahoo!, (2) a trace from a feature animation company, and (3) a streaming media workload. Two case studies in caching and replicated distributed storage systems show that our traces produce application-level results similar to the real workload. The trace generator is fast and readily scales to a system of 4.3 million files. It outperforms existing models in terms of accurately reproducing the characteristics of the real trace. (C) 2013 Elsevier B.V. All rights reserved. |
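The core mechanism in the abstract above is per-object sampling of interarrival times, with every object's events merged into one time-ordered trace. The toy sketch below illustrates that mechanism; the exponential interarrivals and hand-picked object list are illustrative stand-ins, since the paper's generator fits delayed renewal processes to real workloads and shrinks the model by clustering objects statistically, which this sketch omits.

```python
# Toy per-object trace generator: each object draws its own interarrival
# times; a heap merges all objects' events into one time-ordered trace.
import heapq
import random

def generate_trace(objects, horizon):
    """objects: list of (name, mean_interarrival_seconds).
    Returns a time-ordered list of (timestamp, name) events up to horizon."""
    heap = []
    for name, mean in objects:
        # First arrival; a delayed renewal process would draw this initial
        # delay from a separate distribution rather than the renewal one.
        heapq.heappush(heap, (random.expovariate(1.0 / mean), name, mean))
    trace = []
    while heap:
        t, name, mean = heapq.heappop(heap)
        if t > horizon:
            continue  # this object has left the simulated window
        trace.append((t, name))
        # Sample the next interarrival for this object only.
        heapq.heappush(heap, (t + random.expovariate(1.0 / mean), name, mean))
    return trace

# Popular objects get short mean interarrivals; unpopular ones long, so the
# merged trace reproduces both popularity skew and per-object burstiness.
trace = generate_trace([("hot.dat", 0.5), ("cold.dat", 30.0)], horizon=60.0)
print(len(trace), trace[:3])
```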
Montanari, Mirko Limiting Information Exposure in Multi-Domain Monitoring Systems PhD Thesis University of Illinois at Urbana-Champaign, 2013. BibTeX | Tags: @phdthesis{590, title = {Limiting Information Exposure in Multi-Domain Monitoring Systems}, author = {Mirko Montanari}, year = {2013}, date = {2013-08-01}, volume = {Ph.D.}, address = {Urbana, Illinois}, school = {University of Illinois at Urbana-Champaign}, keywords = {}, pubstate = {published}, tppubtype = {phdthesis} } |
Dán, György; Bobba, Rakesh B; Gross, George; Campbell, Roy H Cloud Computing for the Power Grid: From Service Composition to Assured Clouds Conference USENIX Workshop on Hot Topics in Cloud Computing, USENIX USENIX, San Jose, CA, 2013. BibTeX | Tags: @conference{187, title = {Cloud Computing for the Power Grid: From Service Composition to Assured Clouds}, author = {György Dán and Rakesh B. Bobba and George Gross and Roy H. Campbell}, year = {2013}, date = {2013-06-01}, booktitle = {USENIX Workshop on Hot Topics in Cloud Computing}, publisher = {USENIX}, address = {San Jose, CA}, organization = {USENIX}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Marsan, Robert Android Behind the Scenes: Revealing Hidden Malware with Andromeda Masters Thesis University of Illinois, 2013. BibTeX | Tags: @mastersthesis{591, title = {Android Behind the Scenes: Revealing Hidden Malware with Andromeda}, author = {Robert Marsan}, year = {2013}, date = {2013-05-01}, volume = {MS}, address = {Urbana, Illinois}, school = {University of Illinois}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } |
Kharbanda, Harshit Software Systems for Power and Energy Conservation Masters Thesis University of Illinois, 2013. BibTeX | Tags: @mastersthesis{592, title = {Software Systems for Power and Energy Conservation}, author = {Harshit Kharbanda}, year = {2013}, date = {2013-05-01}, volume = {MS}, address = {Urbana, Illinois}, school = {University of Illinois}, keywords = {}, pubstate = {published}, tppubtype = {mastersthesis} } |
Huh, Jun Ho; Montanari, Mirko; Dagit, Derek; Bobba, Rakesh B; Kim, Dongwook; Choi, Yoonjoo; Campbell, Roy H Assessing software integrity of virtual appliances through software whitelists Conference NDSS, 2013. BibTeX | Tags: @conference{186, title = {Assessing software integrity of virtual appliances through software whitelists}, author = {Huh, Jun Ho and Montanari, Mirko and Dagit, Derek and Bobba, Rakesh B. and Kim, Dongwook and Choi, Yoonjoo and Campbell, Roy H.}, year = {2013}, date = {2013-01-01}, booktitle = {NDSS}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |
Montanari, Mirko; Chan, Ellick; Larson, Kevin; Yoo, Wucherl; Campbell, Roy H Distributed security policy conformance Journal Article Computers & Security, 33 , pp. 28-40, 2013, ISSN: 0167-4048. BibTeX | Tags: @article{182, title = {Distributed security policy conformance}, author = {Montanari, Mirko and Chan, Ellick and Larson, Kevin and Yoo, Wucherl and Campbell, Roy H.}, issn = {0167-4048}, year = {2013}, date = {2013-01-01}, journal = {Computers & Security}, volume = {33}, pages = {28-40}, keywords = {}, pubstate = {published}, tppubtype = {article} } |
Montanari, Mirko; Huh, Jun Ho; Bobba, Rakesh B; Campbell, Roy H Limiting Data Exposure in Monitoring Multi-domain Policy Conformance Conference TRUST, IEEE IEEE, Imperial College, London, 2013. BibTeX | Tags: @conference{181, title = {Limiting Data Exposure in Monitoring Multi-domain Policy Conformance}, author = {Montanari, Mirko and Huh, Jun Ho and Bobba, Rakesh B. and Campbell, Roy H.}, year = {2013}, date = {2013-01-01}, booktitle = {TRUST}, pages = {65-82}, publisher = {IEEE}, address = {Imperial College, London}, organization = {IEEE}, keywords = {}, pubstate = {published}, tppubtype = {conference} } |