@book{latex,
title = {LaTeX: A Document Preparation System: User's Guide and Reference Manual},
publisher = {Addison-Wesley Professional},
year = {1994},
author = {Leslie Lamport}
}
@article{Kroemer2019,
abstract = {A key challenge in intelligent robotics is creating robots that are capable of directly interacting with the world around them to achieve their goals. The last decade has seen substantial growth in research on the problem of robot manipulation, which aims to exploit the increasing availability of affordable robot arms and grippers to create robots capable of directly interacting with the world to achieve their goals. Learning will be central to such autonomous systems, as the real world contains too much variation for a robot to expect to have an accurate model of its environment, the objects in it, or the skills required to manipulate them, in advance. We aim to survey a representative subset of that research which uses machine learning for manipulation. We describe a formalization of the robot manipulation learning problem that synthesizes existing research into a single coherent framework and highlight the many remaining research opportunities and challenges.},
archivePrefix = {arXiv},
arxivId = {1907.03146},
author = {Kroemer, Oliver and Niekum, Scott and Konidaris, George},
eprint = {1907.03146},
file = {:Users/barisyazici/Desktop/Grasping/A Review of Robot Learning for Manipulation- Challenges, Representations, and Algorithms.pdf:pdf},
mendeley-groups = {Grasping},
title = {{A Review of Robot Learning for Manipulation: Challenges, Representations, and Algorithms}},
url = {http://arxiv.org/abs/1907.03146},
year = {2019}
}
@book{Sutton2018,
abstract = {Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives when interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the key ideas and algorithms of reinforcement learning. Their discussion ranges from the history of the field's intellectual foundations to the most recent developments and applications. The only necessary mathematical background is familiarity with elementary concepts of probability.The book is divided into three parts. Part I defines the reinforcement learning problem in terms of Markov decision processes. Part II provides basic solution methods: dynamic programming, Monte Carlo methods, and temporal-difference learning. Part III presents a unified view of the solution methods and incorporates artificial neural networks, eligibility traces, and planning; the two final chapters present case studies and consider the future of reinforcement learning.},
author = {Sutton, Richard S. and Barto, Andrew G.},
publisher = {The MIT Press},
file = {:Users/barisyazici/Desktop/books/ReinforcementSutton.pdf:pdf},
isbn = {9780262039246},
mendeley-groups = {RL general},
pages = {1--3},
title = {{Reinforcement Learning: An Introduction, Second Edition}},
year = {2018}
}
@book{AnimalInt11,
title = {Animal Intelligence: Experimental Studies},
publisher = {New York: The Macmillan Co.; London: Macmillan and Co., Ltd.},
year = {1911},
author = {E. L. Thorndike}
}
@book{Gagniuc2017,
abstract = {around examples based on objects such as jars (representing states) and balls (representing transition probabilities) of various colors. Thus, any type of Markov chain configuration is explained in terms of real experiments. These examples relate to each other from chapter to chapter enabling a gradual under- standing of the phenomena. The theory is also accompanied by an algorithm implementation for each example. Chapter 1 begins with a general introduction into the history of probability theory, covering different time periods. In this chapter, the introduction to discrete-time is made using quantifiable examples showing howthe field ofprobability theory arrived in recent times at the notion of dependent variables (Markov model) from experiments related to indepen- dent variables (Bernoulli model). Chapters 2 and 3 are an introduction to simple stochastic matrices and transition probabilities followed by a simulation of a two-state Markov chain. The description starts from the observation of events within a system up to the simulation of that system through a Markov chain. The construction of a stochastic matrix is shown based on both a sequence of observations and observations provided in percentages. Chapter 4 begins with an introduction to predictions that use a two-state Markov chain. Here, the notion of steady state is first approached and discussed in connection with the long-run distribution behavior of the Markov chain. Chapter 5 describes some examples by considering predictions based on Markov chains with more than two states. The first two examples include a three-state Markov chain and a four-state Markov chain, after which a gradual generalization is made for an arbitrary number of states (n-states). In Chapter 6, the notion of absorbing Markov chains it is approached by using tangible examples. Chapter 7 covers a topic linked to the average time spent in a state, whereas Chapter 8 covers dis- cussions on different configurations of chains. Different configurations of state diagrams provide solutions for different problems in many fields. As a continu- ation, Chapter 9 covers the simulation ofan n-state Markov chain used for veri- fying experiments ofvarious diagram configurations. Overall, the book intends a completely different approach on Markov chains, which is based on four convergent lines that include mathematics, implementation, simulation, and experimentation.},
author = {Gagniuc, Paul A},
file = {:Users/barisyazici/Downloads/Markov chains from theory to implementation and experimentation by Gagniuc, Paul A (z-lib.org).pdf:pdf},
isbn = {9781119387572},
mendeley-groups = {RL general},
publisher = {Wiley},
title = {{Markov Chains: From Theory to Implementation and Experimentation}},
year = {2017}
}
@unpublished{PerezMIT,
author = {Tomás Lozano-Pérez and Leslie Kaelbling},
title = {6.825 Techniques in Artificial Intelligence (SMA 5504)},
note = {Massachusetts Institute of Technology: MIT OpenCourseWare, https://ocw.mit.edu},
year = {Fall 2002},
url = {https://ocw.mit.edu/courses/electrical-engineering-and-computer-science/6-825-techniques-in-artificial-intelligence-sma-5504-fall-2002/index.htm#}
}
@article{Bellman1958,
abstract = {Consider a system S specified at any time t by a finite dimensional vector x(t) satisfying a vector differential equation dx/dt = g[x, r(t), f(t)], x(0) = c, where c is the initial state, r(t) is a random forcing term possessing a known distribution, and f(t) is a forcing term chosen, via a feedback process, so as to minimize the expected value of a functional J(x) = f{\{}hook{\}}0T h(x - y, t) dG(t), where y(t) is a known function, or chosen so as to minimize the functional defined by the probability that max0 ≦ t ≦ T h(x - y, t) exceed a specified bound. It is shown how the functional equation technique of dynamic programming may be used to obtain a new computational and analytic approach to problems of this genre. The limited memory capacity of present-day digital computers limits the routine application of these techniques to first and second order systems at the moment, with limited application to higher order systems. {\textcopyright} 1958.},
author = {Bellman, Richard},
doi = {10.1016/S0019-9958(58)80003-0},
file = {:Users/barisyazici/Downloads/1-s2.0-S0019995858800030-main.pdf:pdf},
issn = {00199958},
journal = {Information and Control},
mendeley-groups = {RL general},
number = {3},
pages = {228--239},
title = {{Dynamic programming and stochastic control processes}},
volume = {1},
year = {1958}
}
@incollection{Borovcnik1991,
author = {Borovcnik, Manfred and Bentz, Hans-Joachim and Kapadia, Ramesh},
booktitle = {Chance Encounters: Probability in Education},
doi = {10.1007/978-94-011-3532-0_2},
file = {:Users/barisyazici/Desktop/docs/TUM/donemler/2017W/MachineLearning/Books/Murphy.pdf:pdf},
isbn = {9780262018029},
mendeley-groups = {Machine Learning},
pages = {27--71},
title = {{A Probabilistic Perspective}},
year = {1991}
}
@book{Bishop06,
author = {Bishop, Christopher M.},
title = {Pattern Recognition and Machine Learning (Information Science and Statistics)},
year = {2006},
isbn = {0387310738},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg}
}
@book{Murphy12,
author = {Murphy, Kevin P.},
title = {Machine Learning: A Probabilistic Perspective},
year = {2012},
isbn = {0262018020},
publisher = {The MIT Press}
}
@article{SpinningUp2018,
author = {Achiam, Joshua},
title = {{Spinning Up in Deep Reinforcement Learning}},
year = {2018}
}
@phdthesis{Lin1993,
abstract = {Reinforcement learning agents are adaptive, reactive, and self-supervised. The aim of this dissertation is to extend the state of the art of reinforcement learning and enable its applications to complex robot-learning problems. In particular, it focuses on two issues. First, learning from sparse and delayed reinforcement signals is hard and in general a slow process. Techniques for reducing learning time must be devised. Second, most existing reinforcement learning methods assume that the world is a Markov decision process. This assumption is too strong for many robot tasks of interest.$\backslash$r$\backslash$nThis dissertation demonstrates how we can possibly overcome the slow learning problem and tackle non-Markovian environments, making reinforcement learning more practical for realistic robot tasks: (1) Reinforcement learning can be naturally integrated with artificial neural networks to obtain high-quality generalization, resulting in a significant learning speedup. Neural networks are used in this dissertation, and they generalize effectively even in the presence of noise and a large of binary and real-valued inputs. (2) Reinforcement learning agents can save many learning trials by using an action model, which can be learned on-line. With a model, an agent can mentally experience the effects of its actions without actually executing them. Experience replay is a simple technique that implements this idea, and is shown to be effective in reducing the number of action executions required. (3) Reinforcement learning agents can take advantage of instructive training instances provided by human teachers, resulting in a significant learning speedup. Teaching can also help learning agents avoid local optima during the search for optimal control. Simulation experiments indicate that even a small amount of teaching can save agents many learning trials. (4) Reinforcement learning agents can significantly reduce learning time by hierarchical learning--they first solve elementary learning problems and then combine solutions to the elementary problems to solve a complex problem. Simulation experiments indicate that a robot with hierarchical learning can solve a complex problem, which otherwise is hardly solvable within a reasonable time. (5) Reinforcement learning agents can deal with a wide range of non-Markovian environments by having a memory of their past. Three memory architectures are discussed. They work reasonably well for a variety of simple problems. One of them is also successfully applied to a nontrivial non-Markovian robot task.$\backslash$r$\backslash$n$\backslash$r$\backslash$nThe results of this dissertation rely on computer simulation, including (1) an agent operating in a dynamic and hostile environment and (2) a mobile robot operating in a noisy and non-Markovian environment. The robot simulator is physically realistic. This dissertation concludes that it is possible to build artificial agents than can acquire complex control policies effectively by reinforcement learning.},
author = {Lin, Long-Ji},
file = {:Users/barisyazici/Desktop/Lin1993.pdf:pdf},
school = {Carnegie Mellon University},
keywords = {Q-learning,neural networks (NN),robot simulation},
mendeley-groups = {DQN},
pages = {160},
title = {{Reinforcement learning for robots using neural networks}},
url = {https://search.proquest.com/docview/303995826?accountid=12063{\%}0Ahttp://fg2fy8yh7d.search.serialssolutions.com/directLink?{\&}atitle=Reinforcement+learning+for+robots+using+neural+networks{\&}author=Lin{\%}2C+Long-Ji{\&}issn={\&}title=Reinforcement+learning+for+robots+us},
year = {1993}
}
@article{Mnih,
abstract = {We present the first deep learning model to successfully learn control policies directly from high-dimensional sensory input using reinforcement learning. The model is a convolutional neural network, trained with a variant of Q-learning, whose input is raw pixels and whose output is a value function estimating future rewards. We apply our method to seven Atari 2600 games from the Arcade Learning Environment, with no adjustment of the architecture or learning algorithm. We find that it outperforms all previous approaches on six of the games and surpasses a human expert on three of them.},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin},
file = {:Users/barisyazici/Desktop/ActionBranchingQ-Learning/DQN Extensions/dqn.pdf:pdf},
mendeley-groups = {DQN},
pages = {1--9},
title = {{Playing Atari with Deep Reinforcement Learning}},
year = {2013}
}
@article{Bengio2009,
abstract = {Humans and animals learn much better when the examples are not randomly presented but organized in a meaningful order which illustrates gradually more concepts, and gradually more complex ones. Here, we formalize such training strategies in the context of machine learning, and call them "curriculum learning". In the context of recent research studying the difficulty of training in the presence of non-convex training criteria (for deep deterministic and stochastic neural networks), we explore curriculum learning in various set-ups. The experiments show that significant improvements in generalization can be achieved. We hypothesize that curriculum learning has both an effect on the speed of convergence of the training process to a minimum and, in the case of non-convex criteria, on the quality of the local minima obtained: curriculum learning can be seen as a particular form of continuation method (a general strategy for global optimization of non-convex functions). Copyright 2009.},
author = {Bengio, Yoshua and Louradour, J{\'{e}}r{\^{o}}me and Collobert, Ronan and Weston, Jason},
doi = {10.1145/1553374.1553380},
file = {:Users/barisyazici/Desktop/Curriculum{\_}learning.pdf:pdf},
isbn = {9781605585161},
journal = {ACM International Conference Proceeding Series},
mendeley-groups = {Curriculum},
number = {January 2009},
title = {{Curriculum learning}},
volume = {382},
year = {2009}
}
@article{Sahbani2012,
abstract = {This overview presents computational algorithms for generating 3D object grasps with autonomous multi-fingered robotic hands. Robotic grasping has been an active research subject for decades, and a great deal of effort has been spent on grasp synthesis algorithms. Existing papers focus on reviewing the mechanics of grasping and the fingerobject contact interactions Bicchi and Kumar (2000) [12] or robot hand design and their control Al-Gallaf et al. (1993) [70]. Robot grasp synthesis algorithms have been reviewed in Shimoga (1996) [71], but since then an important progress has been made toward applying learning techniques to the grasping problem. This overview focuses on analytical as well as empirical grasp synthesis approaches. {\textcopyright} 2011 Elsevier B.V. All rights reserved.},
author = {Sahbani, A. and El-Khoury, S. and Bidaud, P.},
doi = {10.1016/j.robot.2011.07.016},
file = {:Users/barisyazici/Downloads/Survey.pdf:pdf},
issn = {09218890},
journal = {Robotics and Autonomous Systems},
keywords = {Force-closure,Grasp synthesis,Learning by demonstration,Task modeling},
mendeley-groups = {ManipulationGeneralReview},
number = {3},
pages = {326--336},
publisher = {Elsevier B.V.},
title = {{An overview of 3D object grasp synthesis algorithms}},
url = {http://dx.doi.org/10.1016/j.robot.2011.07.016},
volume = {60},
year = {2012}
}
@article{Nguyen1987,
author = {Nguyen, Van-duc},
file = {:Users/barisyazici/Downloads/01088008.pdf:pdf},
journal = {Proceedings - IEEE International Conference on Robotics and Automation},
mendeley-groups = {ManipulationGeneralReview},
pages = {234--239},
title = {{Constructing Stable Grasps in 3D}},
year = {1987}
}
@article{Schmidt2018,
abstract = {We present a data-driven, bottom-up, deep learning approach to robotic grasping of unknown objects using Deep Convolutional Neural Networks (DCNNs). The approach uses depth images of the scene as its sole input for synthesis of a single-grasp solution during execution, adequately portraying the robot's visual perception during exploration of a scene. The training input consists of precomputed high-quality grasps, generated by analytical grasp planners, accompanied with rendered depth images of the training objects. In contrast to previous work on applying deep learning techniques to robotic grasping, our approach is able to handle full end-effector poses and therefore approach directions other than the view direction of the camera. Furthermore, the approach is not limited to a certain grasping setup (e. g. parallel jaw gripper) by design. We evaluate the method regarding its force-closure performance in simulation using the KIT and YCB object model datasets as well as a big data grasping database. We demonstrate the performance of our approach in qualitative grasping experiments on the humanoid robot ARMAR-III.},
author = {Schmidt, Philipp and Vahrenkamp, Nikolaus and Wachter, Mirko and Asfour, Tamim},
doi = {10.1109/ICRA.2018.8463204},
file = {:Users/barisyazici/Downloads/Schmidt2018.pdf:pdf},
isbn = {9781538630815},
issn = {10504729},
journal = {Proceedings - IEEE International Conference on Robotics and Automation},
mendeley-groups = {ManipulationGeneralReview,LearningGrasping},
pages = {6831--6838},
title = {{Grasping of Unknown Objects Using Deep Convolutional Neural Networks Based on Depth Images}},
year = {2018}
}
@article{Ekvall2004,
abstract = {We describe our effort in development of an artificial cognitive system, able of performing complex manipulation tasks in a teleoperated or collaborative manner. Some of the work is motivated by human control strategies that, in general, involve comparison between sensory feedback and a-priori known, internal models. According to recent neuroscientific findings, predictions help to reduce the delays in obtaining the sensory information and to perform more complex tasks. This paper deals with the issue of robotic manipulation and grasping in particular. Two main contributions of the paper are: i) evaluation, recognition and modeling of human grasps during the arm transportation sequence, and ii) learning and representation of grasp strategies for different robotic hands.},
author = {Ekvall, Staffan and Kragic, Danica},
doi = {10.1109/robot.2004.1308798},
file = {:Users/barisyazici/Downloads/01308798.pdf:pdf},
isbn = {0780382323},
issn = {10504729},
journal = {Proceedings - IEEE International Conference on Robotics and Automation},
mendeley-groups = {ManipulationGeneralReview},
number = {4},
pages = {3519--3524},
title = {{Interactive grasp learning based on human demonstration}},
volume = {2004},
year = {2004}
}
@article{Saxena2008,
abstract = {We consider the problem of grasping novel objects, specifically objects that are being seen for the first time through vision. Grasping a previously unknown object, one for which a 3-d model is not available, is a challenging problem. Furthermore, even if given a model, one still has to decide where to grasp the object. We present a learning algorithm that neither requires nor tries to build a 3-d model of the object. Given two (or more) images of an object, our algorithm attempts to identify a few points in each image corresponding to good locations at which to grasp the object. This sparse set of points is then triangulated to obtain a 3-d location at which to attempt a grasp. This is in contrast to standard dense stereo, which tries to triangulate every single point in an image (and often fails to return a good 3-d model). Our algorithm for identifying grasp locations from an image is trained by means of supervised learning, using synthetic images for the training set. We demonstrate this approach on two robotic manipulation platforms. Our algorithm successfully grasps a wide variety of objects, such as plates, tape rolls, jugs, cellphones, keys, screwdrivers, staplers, a thick coil of wire, a strangely shaped power horn and others, none of which were seen in the training set. We also apply our method to the task of unloading items from dishwashers.},
author = {Saxena, Ashutosh and Driemeyer, Justin and Ng, Andrew Y.},
doi = {10.1177/0278364907087172},
file = {:Users/barisyazici/Downloads/IJRR{\_}saxena{\_}etal{\_}roboticgraspingofnovelobjects.pdf:pdf},
issn = {02783649},
journal = {International Journal of Robotics Research},
keywords = {Grasping,Learning and adaptive systems,Perception,Personal robots,Robotics,Vision of grasping},
mendeley-groups = {ManipulationGeneralReview},
number = {2},
pages = {157--173},
title = {{Robotic grasping of novel objects using vision}},
volume = {27},
year = {2008}
}
@article{Kalashnikov2018,
abstract = {In this paper, we study the problem of learning vision-based dynamic manipulation skills using a scalable reinforcement learning approach. We study this problem in the context of grasping, a longstanding challenge in robotic manipulation. In contrast to static learning behaviors that choose a grasp point and then execute the desired grasp, our method enables closed-loop vision-based control, whereby the robot continuously updates its grasp strategy based on the most recent observations to optimize long-horizon grasp success. To that end, we introduce QT-Opt, a scalable self-supervised vision-based reinforcement learning framework that can leverage over 580k real-world grasp attempts to train a deep neural network Q-function with over 1.2M parameters to perform closed-loop, real-world grasping that generalizes to 96{\%} grasp success on unseen objects. Aside from attaining a very high success rate, our method exhibits behaviors that are quite distinct from more standard grasping systems: using only RGB vision-based perception from an over-the-shoulder camera, our method automatically learns regrasping strategies, probes objects to find the most effective grasps, learns to reposition objects and perform other non-prehensile pre-grasp manipulations, and responds dynamically to disturbances and perturbations.},
archivePrefix = {arXiv},
arxivId = {1806.10293},
author = {Kalashnikov, Dmitry and Irpan, Alex and Pastor, Peter and Ibarz, Julian and Herzog, Alexander and Jang, Eric and Quillen, Deirdre and Holly, Ethan and Kalakrishnan, Mrinal and Vanhoucke, Vincent and Levine, Sergey},
eprint = {1806.10293},
file = {:Users/barisyazici/Library/Application Support/Mendeley Desktop/Downloaded/Kalashnikov et al. - 2018 - QT-Opt Scalable Deep Reinforcement Learning for Vision-Based Robotic Manipulation.pdf:pdf},
keywords = {deep learning,grasping,reinforcement learning},
mendeley-groups = {RL in Robotic},
number = {CoRL},
pages = {1--23},
title = {{QT-Opt: Scalable Deep Reinforcement Learning for Vision-Based Robotic Manipulation}},
url = {http://arxiv.org/abs/1806.10293},
year = {2018}
}
@article{Andrychowicz2020,
abstract = {We use reinforcement learning (RL) to learn dexterous in-hand manipulation policies that can perform vision-based object reorientation on a physical Shadow Dexterous Hand. The training is performed in a simulated environment in which we randomize many of the physical properties of the system such as friction coefficients and an object's appearance. Our policies transfer to the physical robot despite being trained entirely in simulation. Our method does not rely on any human demonstrations, but many behaviors found in human manipulation emerge naturally, including finger gaiting, multi-finger coordination, and the controlled use of gravity. Our results were obtained using the same distributed RL system that was used to train OpenAI Five. We also include a video of our results: https://youtu.be/jwSbzNHGflM.},
archivePrefix = {arXiv},
arxivId = {1808.00177},
author = {Andrychowicz, Marcin and Baker, Bowen and Chociej, Maciek and J{\'{o}}zefowicz, Rafal and McGrew, Bob and Pachocki, Jakub and Petron, Arthur and Plappert, Matthias and Powell, Glenn and Ray, Alex and Schneider, Jonas and Sidor, Szymon and Tobin, Josh and Welinder, Peter and Weng, Lilian and Zaremba, Wojciech},
doi = {10.1177/0278364919887447},
eprint = {1808.00177},
file = {:Users/barisyazici/Downloads/1808.00177.pdf:pdf},
issn = {17413176},
journal = {International Journal of Robotics Research},
keywords = {Dexterous manipulation,adaptive control,humanoid robots,learning and adaptive systems,multifingered hands},
mendeley-groups = {RL in Robotic},
number = {1},
pages = {3--20},
title = {{Learning dexterous in-hand manipulation}},
volume = {39},
year = {2020}
}
@article{Caldera2018,
abstract = {For robots to attain more general-purpose utility, grasping is a necessary skill to master. Such general-purpose robots may use their perception abilities to visually identify grasps for a given object. A grasp describes how a robotic end-effector can be arranged to securely grab an object and successfully lift it without slippage. Traditionally, grasp detection requires expert human knowledge to analytically form the task-specific algorithm, but this is an arduous and time-consuming approach. During the last five years, deep learning methods have enabled significant advancements in robotic vision, natural language processing, and automated driving applications. The successful results of these methods have driven robotics researchers to explore the use of deep learning methods in task-generalised robotic applications. This paper reviews the current state-of-the-art in regards to the application of deep learning methods to generalised robotic grasping and discusses how each element of the deep learning approach has improved the overall performance of robotic grasp detection. Several of the most promising approaches are evaluated and the most suitable for real-time grasp detection is identified as the one-shot detection method. The availability of suitable volumes of appropriate training data is identified as a major obstacle for effective utilisation of the deep learning approaches, and the use of transfer learning techniques is proposed as a potential mechanism to address this. Finally, current trends in the field and future potential research directions are discussed.},
author = {Caldera, Shehan and Rassau, Alexander and Chai, Douglas},
doi = {10.3390/mti2030057},
file = {:Users/barisyazici/Desktop/mti-02-00057.pdf:pdf},
issn = {24144088},
journal = {Multimodal Technologies and Interaction},
keywords = {Cnn,Convolutional neural networks,Dcnn,Deep convolutional neural networks,Deep learning,Human-robot collaboration,Robot learning,Robotic grasp detection,Robotic grasping,Transfer learning},
mendeley-groups = {RL in Robotic},
number = {3},
title = {{Review of deep learning methods in robotic grasp detection}},
volume = {2},
year = {2018}
}
@article{Lenz2013,
abstract = {We consider the problem of detecting robotic grasps in an RGB-D view of a scene containing objects. In this work, we apply a deep learning approach to solve this problem, which avoids time-consuming hand-design of features. This presents two main challenges. First, we need to evaluate a huge number of candidate grasps. In order to make detection fast, as well as robust, we present a two-step cascaded structure with two deep networks, where the top detections from the first are re-evaluated by the second. The first network has fewer features, is faster to run, and can effectively prune out unlikely candidate grasps. The second, with more features, is slower but has to run only on the top few detections. Second, we need to handle multimodal inputs well, for which we present a method to apply structured regularization on the weights based on multimodal group regularization. We demonstrate that our method outperforms the previous state-of-the-art methods in robotic grasp detection, and can be used to successfully execute grasps on two different robotic platforms.},
archivePrefix = {arXiv},
arxivId = {1301.3592},
author = {Lenz, Ian and Lee, Honglak and Saxena, Ashutosh},
eprint = {1301.3592},
file = {:Users/barisyazici/Library/Application Support/Mendeley Desktop/Downloaded/Lenz, Lee, Saxena - 2013 - Deep Learning for Detecting Robotic Grasps.pdf:pdf},
mendeley-groups = {ManipulationGeneralReview,LearningGrasping},
month = {jan},
title = {{Deep Learning for Detecting Robotic Grasps}},
url = {http://arxiv.org/abs/1301.3592},
year = {2013}
}
@article{Mahler2017,
abstract = {To reduce data collection time for deep learning of robust robotic grasp plans, we explore training from a synthetic dataset of 6.7 million point clouds, grasps, and analytic grasp metrics generated from thousands of 3D models from Dex-Net 1.0 in randomized poses on a table. We use the resulting dataset, Dex-Net 2.0, to train a Grasp Quality Convolutional Neural Network (GQ-CNN) model that rapidly predicts the probability of success of grasps from depth images, where grasps are specified as the planar position, angle, and depth of a gripper relative to an RGB-D sensor. Experiments with over 1,000 trials on an ABB YuMi comparing grasp planning methods on singulated objects suggest that a GQ-CNN trained with only synthetic data from Dex-Net 2.0 can be used to plan grasps in 0.8s with a success rate of 93{\%} on eight known objects with adversarial geometry and is 3× faster than registering point clouds to a precomputed dataset of objects and indexing grasps. The Dex-Net 2.0 grasp planner also has the highest success rate on a dataset of 10 novel rigid objects and achieves 99{\%} precision (one false positive out of 69 grasps classified as robust) on a dataset of 40 novel household objects, some of which are articulated or deformable.},
archivePrefix = {arXiv},
arxivId = {1703.09312},
author = {Mahler, Jeffrey and Liang, Jacky and Niyaz, Sherdil and Laskey, Michael and Doan, Richard and Liu, Xinyu and Ojea, Juan Aparicio and Goldberg, Ken},
doi = {10.15607/rss.2017.xiii.058},
eprint = {1703.09312},
file = {:Users/barisyazici/Downloads/1703.09312.pdf:pdf},
isbn = {9780992374730},
issn = {2330765X},
journal = {Robotics: Science and Systems},
mendeley-groups = {LearningGrasping},
title = {{Dex-Net 2.0: Deep learning to plan Robust grasps with synthetic point clouds and analytic grasp metrics}},
volume = {13},
year = {2017}
}
@article{Quillen2018,
abstract = {In this paper, we explore deep reinforcement learning algorithms for vision-based robotic grasping. Model-free deep reinforcement learning (RL) has been successfully applied to a range of challenging environments, but the proliferation of algorithms makes it difficult to discern which particular approach would be best suited for a rich, diverse task like grasping. To answer this question, we propose a simulated benchmark for robotic grasping that emphasizes off-policy learning and generalization to unseen objects. Off-policy learning enables utilization of grasping data over a wide variety of objects, and diversity is important to enable the method to generalize to new objects that were not seen during training. We evaluate the benchmark tasks against a variety of Q-function estimation methods, a method previously proposed for robotic grasping with deep neural network models, and a novel approach based on a combination of Monte Carlo return estimation and an off-policy correction. Our results indicate that several simple methods provide a surprisingly strong competitor to popular algorithms such as double Q-learning, and our analysis of stability sheds light on the relative tradeoffs between the algorithms 11Accompanying video: https://goo.gl/pyMd6p.},
archivePrefix = {arXiv},
arxivId = {1802.10264},
author = {Quillen, Deirdre and Jang, Eric and Nachum, Ofir and Finn, Chelsea and Ibarz, Julian and Levine, Sergey},
doi = {10.1109/ICRA.2018.8461039},
eprint = {1802.10264},
file = {:Users/barisyazici/Library/Application Support/Mendeley Desktop/Downloaded/Quillen et al. - 2018 - Deep reinforcement learning for vision-based robotic grasping A simulated comparative evaluation of off-policy m.pdf:pdf},
isbn = {9781538630815},
issn = {10504729},
journal = {Proceedings - IEEE International Conference on Robotics and Automation},
mendeley-groups = {RL in Robotic},
pages = {6284--6291},
title = {{Deep reinforcement learning for vision-based robotic grasping: A simulated comparative evaluation of off-policy methods}},
year = {2018}
}
@article{Breyer2018,
abstract = {Enabling autonomous robots to interact in unstructured environments with dynamic objects requires manipulation capabilities that can deal with clutter, changes, and objects' variability. This paper presents a comparison of different reinforcement learning-based approaches for object picking with a robotic manipulator. We learn closed-loop policies mapping depth camera inputs to motion commands and compare different approaches to keep the problem tractable, including reward shaping, curriculum learning and using a policy pre-trained on a task with a reduced action set to warm-start the full problem. For efficient and more flexible data collection, we train in simulation and transfer the policies to a real robot. We show that using curriculum learning, policies learned with a sparse reward formulation can be trained at similar rates as with a shaped reward. These policies result in success rates comparable to the policy initialized on the simplified task. We could successfully transfer these policies to the real robot with only minor modifications of the depth image filtering. We found that using a heuristic to warm-start the training was useful to enforce desired behavior, while the policies trained from scratch using a curriculum learned better to cope with unseen scenarios where objects are removed.},
annote = {- Curriculum Learning
- Reward shaping},
archivePrefix = {arXiv},
arxivId = {1803.04996},
author = {Breyer, Michel and Furrer, Fadri and Novkovic, Tonci and Siegwart, Roland and Nieto, Juan},
doi = {10.1109/LRA.2019.2896467},
eprint = {1803.04996},
file = {:Users/barisyazici/Desktop/ActionBranchingQ-Learning/Papers/Breyer et al. - 2019 - Comparing task simplifications to learn closed-loop object picking using deep reinforcement learning-annotated.pdf:pdf},
issn = {23773766},
journal = {IEEE Robotics and Automation Letters},
keywords = {Grasping,curriculum learning,deep reinforcement learning,visual servoing},
mendeley-groups = {Curriculum},
month = {mar},
number = {2},
pages = {1549--1556},
publisher = {IEEE},
title = {{Comparing Task Simplifications to Learn Closed-Loop Object Picking Using Deep Reinforcement Learning}},
url = {http://arxiv.org/abs/1803.04996},
volume = {4},
year = {2018}
}
@article{openai2019rubiks,
title={Solving Rubik's Cube with a Robot Hand},
author={OpenAI and Ilge Akkaya and Marcin Andrychowicz and Maciek Chociej and Mateusz Litwin and Bob McGrew and Arthur Petron and Alex Paino and Matthias Plappert and Glenn Powell and Raphael Ribas and Jonas Schneider and Nikolas Tezak and Jerry Tworek and Peter Welinder and Lilian Weng and Qiming Yuan and Wojciech Zaremba and Lei Zhang},
year={2019},
journal={arXiv preprint},
}
@article{Tobin2017,
abstract = {Bridging the 'reality gap' that separates simulated robotics from experiments on hardware could accelerate robotic research through improved data availability. This paper explores domain randomization, a simple technique for training models on simulated images that transfer to real images by randomizing rendering in the simulator. With enough variability in the simulator, the real world may appear to the model as just another variation. We focus on the task of object localization, which is a stepping stone to general robotic manipulation skills. We find that it is possible to train a real-world object detector that is accurate to 1.5 cm and robust to distractors and partial occlusions using only data from a simulator with non-realistic random textures. To demonstrate the capabilities of our detectors, we show they can be used to perform grasping in a cluttered environment. To our knowledge, this is the first successful transfer of a deep neural network trained only on simulated RGB images (without pre-training on real images) to the real world for the purpose of robotic control.},
archivePrefix = {arXiv},
arxivId = {1703.06907},
author = {Tobin, Josh and Fong, Rachel and Ray, Alex and Schneider, Jonas and Zaremba, Wojciech and Abbeel, Pieter},
doi = {10.1109/IROS.2017.8202133},
eprint = {1703.06907},
file = {:Users/barisyazici/Downloads/1703.06907.pdf:pdf},
isbn = {9781538626825},
issn = {21530866},
journal = {IEEE International Conference on Intelligent Robots and Systems},
mendeley-groups = {RL in Robotic},
pages = {23--30},
title = {{Domain randomization for transferring deep neural networks from simulation to the real world}},
volume = {2017-Septe},
year = {2017}
}
@article{Erez2015,
abstract = {There is growing need for software tools that can accurately simulate the complex dynamics of modern robots. While a number of candidates exist, the field is fragmented. It is difficult to select the best tool for a given project, or to predict how much effort will be needed and what the ultimate simulation performance will be. Here we introduce new quantitative measures of simulation performance, focusing on the numerical challenges that are typical for robotics as opposed to multi-body dynamics and gaming. We then present extensive simulation results, obtained within a new software framework for instantiating the same model in multiple engines and running side-by-side comparisons. Overall we find that each engine performs best on the type of system it was designed and optimized for: MuJoCo wins the robotics-related tests, while the gaming engines win the gaming-related tests without a clear leader among them. The simulations are illustrated in the accompanying movie.},
author = {Erez, Tom and Tassa, Yuval and Todorov, Emanuel},
doi = {10.1109/ICRA.2015.7139807},
file = {:Users/barisyazici/Downloads/ErezICRA15.pdf:pdf},
issn = {10504729},
journal = {Proceedings - IEEE International Conference on Robotics and Automation},
mendeley-groups = {SimulationComparison},
number = {June},
pages = {4397--4404},
title = {{Simulation tools for model-based robotics: Comparison of Bullet, Havok, MuJoCo, ODE and PhysX}},
volume = {2015-June},
year = {2015}
}
@misc{OpenAIgym,
Author = {Greg Brockman and Vicki Cheung and Ludwig Pettersson and Jonas Schneider and John Schulman and Jie Tang and Wojciech Zaremba},
Title = {OpenAI Gym},
Year = {2016},
Eprint = {arXiv:1606.01540},
}
@MISC{coumans2020,
author = {Erwin Coumans and Yunfei Bai},
title = {PyBullet, a Python module for physics simulation for games, robotics and machine learning},
howpublished = {\url{http://pybullet.org}},
year = {2016--2020}
}
@article{Pitonakova2018,
abstract = {In this paper, the characteristics and performance of three open-source simulators for robotics, V-REP, Gazebo and ARGoS, are thoroughly analysed and compared. While they all allow for programming in C++, they also represent clear alternatives when it comes to the trade-off between complexity and performance. Attention is given to their built-in features, robot libraries, programming methods and the usability of their user interfaces. Benchmark test results are reported in order to identify how well the simulators can cope with environments of varying complexity. The richness of features of V-REP and the strong performance of Gazebo and ARGoS in complex scenes are highlighted. Various usability issues of Gazebo are also noted.},
author = {Pitonakova, Lenka and Giuliani, Manuel and Pipe, Anthony and Winfield, Alan},
doi = {10.1007/978-3-319-96728-8_30},
file = {:Users/barisyazici/Downloads/simulatorsComparison.pdf:pdf},
isbn = {9783319967271},
issn = {16113349},
journal = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
mendeley-groups = {SimulationComparison},
pages = {357--368},
title = {{Feature and performance comparison of the V-REP, Gazebo and ARGoS robot simulators}},
volume = {10965 LNAI},
year = {2018}
}
@inproceedings{Gazebo2004,
abstract = {Simulators have played a critical role in robotics research as tools for quick and efficient testing of new concepts, strategies, and algorithms. To date, most simulators have been restricted to 2D worlds, and few have matured to the point where they are both highly capable and easily adaptable. Gazebo is designed to fill this niche by creating a 3D dynamic multi-robot environment capable of recreating the complex worlds that will be encountered by the next gen- eration of mobile robots. Its open source status, fine grained control, and high fidelity place Gazebo in a unique position to become more than just a stepping stone between the drawing board and real hardware: data visualization, simulation of remote environments, and even reverse engineering of black- box systems are all possible applications. Gazebo is developed in cooperation with the Player and Stage projects [1], [2], [3], and is available from http://playerstage.sourceforge.net/gazebo/ gazebo.html.},
author = {Koenig, Nathan and Howard, Andrew},
booktitle = {International Conference on Intelligent Robots and Systems},
file = {:Users/barisyazici/Downloads/gazebo.pdf:pdf},
isbn = {0780384636},
mendeley-groups = {SimulationComparison},
pages = {2149--2154},
title = {{Design and Use Paradigms for Gazebo, An Open-Source Multi-Robot Simulator}},
year = {2004}
}
@article{Todorov2012,
abstract = {We describe a new physics engine tailored to model-based control. Multi-joint dynamics are represented in generalized coordinates and computed via recursive algorithms. Contact responses are computed via efficient new algorithms we have developed, based on the modern velocity-stepping approach which avoids the difficulties with spring-dampers. Models are specified using either a high-level C++ API or an intuitive XML file format. A built-in compiler transforms the user model into an optimized data structure used for runtime computation. The engine can compute both forward and inverse dynamics. The latter are well-defined even in the presence of contacts and equality constraints. The model can include tendon wrapping as well as actuator activation states (e.g. pneumatic cylinders or muscles). To facilitate optimal control applications and in particular sampling and finite differencing, the dynamics can be evaluated for different states and controls in parallel. Around 400,000 dynamics evaluations per second are possible on a 12-core machine, for a 3D homanoid with 18 dofs and 6 active contacts. We have already used the engine in a number of control applications. It will soon be made publicly available. {\textcopyright} 2012 IEEE.},
author = {Todorov, Emanuel and Erez, Tom and Tassa, Yuval},
doi = {10.1109/IROS.2012.6386109},
file = {:Users/barisyazici/Downloads/TodorovIROS12(1).pdf:pdf},
isbn = {9781467317375},
issn = {21530858},
journal = {IEEE International Conference on Intelligent Robots and Systems},
mendeley-groups = {SimulationComparison},
pages = {5026--5033},
title = {{MuJoCo: A physics engine for model-based control}},
year = {2012}
}
@misc{MujocoPy17,
author = {OpenAI},
title = {mujoco-py},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/openai/mujoco-py}},
year = {2017}
}
@misc{stable-baselines,
author = {Hill, Ashley and Raffin, Antonin and Ernestus, Maximilian and Gleave, Adam and Kanervisto, Anssi and Traore, Rene and Dhariwal, Prafulla and Hesse, Christopher and Klimov, Oleg and Nichol, Alex and Plappert, Matthias and Radford, Alec and Schulman, John and Sidor, Szymon and Wu, Yuhuai},
title = {Stable Baselines},
year = {2018},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/hill-a/stable-baselines}},
}
@misc{StableBullet,
author = {Erwin Coumans and Yunfei Bai},
title = {PyBullet, a Python module for physics simulation for games, robotics and machine learning},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://bit.ly/3jqU7Kd}},
year = {2016--2020}
}
@article{Haarnoja2018,
abstract = {Model-free deep reinforcement learning (RL) algorithms have been demonstrated on a range of challenging decision making and control tasks. However, these methods typically suffer from two major challenges: very high sample complexity and brittle convergence properties, which necessitate meticulous hyperparameter tuning. Both of these challenges severely limit the applicability of such methods to complex, real-world domains. In this paper, we propose soft actor-critic, an off-policy actor-critic deep RL algorithm based on the maximum entropy reinforcement learning framework. In this framework, the actor aims to maximize expected reward while also maximizing entropy-that is, to succeed at the task while acting as randomly as possible. Prior deep RL methods based on this framework have been formulated as Q-learning methods. By combining off-policy updates with a stable stochastic actorcritic formulation, our method achieves state-ofthe-art performance on a range of continuous control benchmark tasks, outperforming prior onpolicy and off-policy methods. Furthermore, we demonstrate that, in contrast to other off-policy algorithms, our approach is very stable, achieving very similar performance across different random seeds.},
archivePrefix = {arXiv},
arxivId = {arXiv:1801.01290v2},
author = {Haarnoja, Tuomas and Zhou, Aurick and Abbeel, Pieter and Levine, Sergey},
eprint = {arXiv:1801.01290v2},
file = {:Users/barisyazici/Downloads/1801.01290.pdf:pdf},
isbn = {9781510867963},
journal = {35th International Conference on Machine Learning, ICML 2018},
mendeley-groups = {EntropyBasedAlgs},
pages = {2976--2989},
title = {{Soft actor-critic: Off-policy maximum entropy deep reinforcement learning with a stochastic actor}},
volume = {5},
year = {2018}
}
@article{Schulman2017,
abstract = {Two of the leading approaches for model-free reinforcement learning are policy gradient methods and {\$}Q{\$}-learning methods. {\$}Q{\$}-learning methods can be effective and sample-efficient when they work, however, it is not well-understood why they work, since empirically, the {\$}Q{\$}-values they estimate are very inaccurate. A partial explanation may be that {\$}Q{\$}-learning methods are secretly implementing policy gradient updates: we show that there is a precise equivalence between {\$}Q{\$}-learning and policy gradient methods in the setting of entropy-regularized reinforcement learning, that "soft" (entropy-regularized) {\$}Q{\$}-learning is exactly equivalent to a policy gradient method. We also point out a connection between {\$}Q{\$}-learning methods and natural policy gradient methods. Experimentally, we explore the entropy-regularized versions of {\$}Q{\$}-learning and policy gradients, and we find them to perform as well as (or slightly better than) the standard variants on the Atari benchmark. We also show that the equivalence holds in practical settings by constructing a {\$}Q{\$}-learning method that closely matches the learning dynamics of A3C without using a target network or {\$}\backslashepsilon{\$}-greedy exploration schedule.},
archivePrefix = {arXiv},
arxivId = {1704.06440},
author = {Schulman, John and Chen, Xi and Abbeel, Pieter},
eprint = {1704.06440},
file = {:Users/barisyazici/Downloads/1704.06440.pdf:pdf},
mendeley-groups = {EntropyBasedAlgs},
pages = {1--15},
title = {{Equivalence Between Policy Gradients and Soft Q-Learning}},
url = {http://arxiv.org/abs/1704.06440},
year = {2017}
}
@article{Haarnoja2017,
abstract = {We propose a method for learning expressive energy-based policies for continuous states and actions, which has been feasible only in tabular domains before. We apply our method to learning maximum entropy policies, resulting into a new algorithm, called soft Q-learning, that expresses the optimal policy via a Boltzmann distribution. We use the recently proposed amortized Stein variational gradient descent to learn a stochastic sampling network that approximates samples from this distribution. The benefits of the proposed algorithm include improved exploration and compositionality that allows transferring skills between tasks, which we confirm in simulated experiments with swimming and walking robots. We also draw a connection to actor-critic methods, which can be viewed performing approximate inference on the corresponding energy-based model.},
archivePrefix = {arXiv},
arxivId = {1702.08165},
author = {Haarnoja, Tuomas and Tang, Haoran and Abbeel, Pieter and Levine, Sergey},
eprint = {1702.08165},
file = {:Users/barisyazici/Downloads/haarnoja17a.pdf:pdf},
isbn = {9781510855144},
journal = {34th International Conference on Machine Learning, ICML 2017},
mendeley-groups = {EntropyBasedAlgs},
pages = {2171--2186},
title = {{Reinforcement learning with deep energy-based policies}},
volume = {3},
year = {2017}
}
@phdthesis{Ziebart2010,
abstract = {Appropriately generalizing predictive algorithms from a few training examples of human behavior is a key challenge for forecasting future behavior. In this thesis, we introduce the principle of maximum causal entropy, which integrates decision theory with information theory to create a novel probabilistic framework for predicting sequences of behavior. This approach matches purposeful measures of behavior while otherwise being as uncertain as possible–all within settings where relevant information is sequentially revealed over time. This provides the worst-case predictive log-loss guarantees of maximum entropy (Gr{\"{u}}nwald {\&} Dawid, 2003) in the reward-based, sequential decision setting of inverse optimal control (Abbeel {\&} Ng, 2004). We derive probabilistic models for decision, control, and multi-player game settings using this approach. We then develop corresponding algorithms for efficient inference that are relaxations of the Bellman equation (Bellman, 1957) and learning algorithms that leverage convexity. We apply the models and algorithms to a number of real world decision prediction tasks. Specifically, we present empirical evaluations of the approach in the domains of vehicle route preference modeling using 100,000 miles of Pittsburgh taxi data, pedestrian motion forecasting from weeks of movement data in the Intel Pittsburgh lab, and robust prediction of game play in stochastic multi-player games.},
author = {Ziebart, Brian},
file = {:Users/barisyazici/Downloads/CMU-ML-10-110.pdf:pdf},
school = {Carnegie Mellon University},
mendeley-groups = {EntropyBasedAlgs},
number = {December},
pages = {1--215},
title = {{Modeling Purposeful Adaptive Behavior with the Principle of Maximum Causal Entropy}},
url = {papers2://publication/uuid/AEB1E579-7EC8-4DAD-8B3C-FFE66E8B314F},
year = {2010}
}
@article{Konda2000,
abstract = {We propose and analyze a class of actor-critic algorithms for simulation-based optimization of a Markov decision process over a parameterized family of randomized stationary policies. These are two-time-scale algorithms in which the critic uses TD learning with a linear approximation architecture and the actor is updated in an approximate gradient direction based on information provided by the critic. We show that the features for the critic should span a subspace prescribed by the choice of parameterization of the actor. We conclude by discussing convergence properties and some open problems.},
author = {Konda, Vijay R. and Tsitsiklis, John N.},
file = {:Users/barisyazici/Downloads/1786-actor-critic-algorithms.pdf:pdf},
isbn = {0262194503},
issn = {10495258},
journal = {Advances in Neural Information Processing Systems},
mendeley-groups = {Policy Based},
pages = {1008--1014},
title = {{Actor-critic algorithms}},
year = {2000}
}
@article{Tavakoli2018,
abstract = {Discrete-action algorithms have been central to numerous recent successes of deep reinforcement learning. However, applying these algorithms to high-dimensional action tasks requires tackling the combinatorial increase of the number of possible actions with the number of action dimensions. This problem is further exacerbated for continuous-action tasks that require fine control of actions via discretization. In this paper, we propose a novel neural architecture featuring a shared decision module followed by several network branches, one for each action dimension. This approach achieves a linear increase of the number of network outputs with the number of degrees of freedom by allowing a level of independence for each individual action dimension. To illustrate the approach, we present a novel agent, called Branching Dueling Q-Network (BDQ), as a branching variant of the Dueling Double Deep Q-Network (Dueling DDQN). We evaluate the performance of our agent on a set of challenging continuous control tasks. The empirical results show that the proposed agent scales gracefully to environments with increasing action dimensionality and indicate the significance of the shared decision module in coordination of the distributed action branches. Furthermore, we show that the proposed agent performs competitively against a state-of-the-art continuous control algorithm, Deep Deterministic Policy Gradient (DDPG).},
archivePrefix = {arXiv},
arxivId = {1711.08946},
author = {Tavakoli, Arash and Pardo, Fabio and Kormushev, Petar},
eprint = {1711.08946},
isbn = {9781577358008},
journal = {32nd AAAI Conference on Artificial Intelligence, AAAI 2018},
pages = {4131--4138},
title = {{Action branching architectures for deep reinforcement learning}},
year = {2018}
}
@article{Lillicrap2016,
abstract = {We adapt the ideas underlying the success of Deep Q-Learning to the continuous action domain. We present an actor-critic, model-free algorithm based on the deterministic policy gradient that can operate over continuous action spaces. Using the same learning algorithm, network architecture and hyper-parameters, our algorithm robustly solves more than 20 simulated physics tasks, including classic problems such as cartpole swing-up, dexterous manipulation, legged locomotion and car driving. Our algorithm is able to find policies whose performance is competitive with those found by a planning algorithm with full access to the dynamics of the domain and its derivatives. We further demonstrate that for many of the tasks the algorithm can learn policies “end-to-end”: directly from raw pixel inputs.},
archivePrefix = {arXiv},
arxivId = {1509.02971},
author = {Lillicrap, Timothy P. and Hunt, Jonathan J. and Pritzel, Alexander and Heess, Nicolas and Erez, Tom and Tassa, Yuval and Silver, David and Wierstra, Daan},
eprint = {1509.02971},
journal = {4th International Conference on Learning Representations, ICLR 2016 - Conference Track Proceedings},
title = {{Continuous control with deep reinforcement learning}},
year = {2016}
}
@article{Wang2016,
abstract = {In recent years there have been many successes of using deep representations in reinforcement learning. Still, many of these applications use conventional architectures, such as convolutional networks, LSTMs, or auto-encoders. In this paper, we present a new neural network architecture for model-free reinforcement learning. Our dueling network represents two separate estimators: one for the state value function and one for the state-dependent action advantage function. The main benefit of this factoring is to generalize learning across actions without imposing any change to the underlying reinforcement learning algorithm. Our results show that this architecture leads to better policy evaluation in the presence of many similar-valued actions. Moreover, the dueling architecture enables our RL agent to outperform the state-of-the-art on the Atari 2600 domain.},
archivePrefix = {arXiv},
arxivId = {1511.06581},
author = {Wang, Ziyu and Schaul, Tom and Hessel, Matteo and {Van Hasselt}, Hado and Lanctot, Marc and {de Freitas}, Nando},
eprint = {1511.06581},
isbn = {9781510829008},
journal = {33rd International Conference on Machine Learning, ICML 2016},
number = {9},
pages = {2939--2947},
title = {{Dueling Network Architectures for Deep Reinforcement Learning}},
volume = {4},
year = {2016}
}
@misc{Keras,
title={Keras},
author={Chollet, Fran\c{c}ois and others},
year={2015},
howpublished={\url{https://keras.io}},
}
@article{Tensoflow,
abstract = {TensorFlow is a machine learning system that operates at large scale and in heterogeneous environments. TensorFlow uses dataflow graphs to represent computation, shared state, and the operations that mutate that state. It maps the nodes of a dataflow graph across many machines in a cluster, and within a machine across multiple computational devices, including multicore CPUs, general-purpose GPUs, and custom-designed ASICs known as Tensor Processing Units (TPUs). This architecture gives flexibility to the application developer: whereas in previous "parameter server" designs the management of shared state is built into the system, TensorFlow enables developers to experiment with novel optimizations and training algorithms. TensorFlow supports a variety of applications, with a focus on training and inference on deep neural networks. Several Google services use TensorFlow in production, we have released it as an open-source project, and it has become widely used for machine learning research. In this paper, we describe the TensorFlow dataflow model and demonstrate the compelling performance that TensorFlow achieves for several real-world applications.},
archivePrefix = {arXiv},
arxivId = {1605.08695},
author = {Abadi, Mart{\'{i}}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and Kudlur, Manjunath and Levenberg, Josh and Monga, Rajat and Moore, Sherry and Murray, Derek G. and Steiner, Benoit and Tucker, Paul and Vasudevan, Vijay and Warden, Pete and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
eprint = {1605.08695},
isbn = {9781931971331},
journal = {Proceedings of the 12th USENIX Symposium on Operating Systems Design and Implementation, OSDI 2016},
pages = {265--283},
title = {{TensorFlow: A system for large-scale machine learning}},
year = {2016}
}
@article{PyTorch,
abstract = {This paper presents the design, implementation, and evaluation of the PyTorch distributed data parallel module. PyTorch is a widely-adopted scientific computing package used in deep learning research and applications. Recent advances in deep learning argue for the value of large datasets and large models, which necessitates the ability to scale out model training to more computational resources. Data parallelism has emerged as a popular solution for distributed training thanks to its straightforward principle and broad applicability. In general, the technique of distributed data parallelism replicates the model on every computational resource to generate gradients independently and then communicates those gradients at each iteration to keep model replicas consistent. Despite the conceptual simplicity of the technique, the subtle dependencies between computation and communication make it non-trivial to optimize the distributed training efficiency. As of v1.5, PyTorch natively provides several techniques to accelerate distributed data parallel, including bucketing gradients, overlapping computation with communication, and skipping gradient synchronization. Evaluations show that, when configured appropriately, the PyTorch distributed data parallel module attains near-linear scalability using 256 GPUs.},
archivePrefix = {arXiv},
arxivId = {2006.15704},
author = {Li, Shen and Zhao, Yanli and Varma, Rohan and Salpekar, Omkar and Noordhuis, Pieter and Li, Teng and Paszke, Adam and Smith, Jeff and Vaughan, Brian and Damania, Pritam and Chintala, Soumith},
eprint = {2006.15704},
title = {{PyTorch Distributed: Experiences on Accelerating Data Parallel Training}},
url = {http://arxiv.org/abs/2006.15704},
year = {2020}
}
@article{Horace,
author = {He, Horace},
title = {The State of Machine Learning Frameworks in 2019},
journal = {The Gradient},
year = {2019},
howpublished = {\url{https://thegradient.pub/state-of-ml-frameworks-2019-pytorch-dominates-research-tensorflow-dominates-industry/}},
}
@misc{stable-baselines3,
author = {Raffin, Antonin and Hill, Ashley and Ernestus, Maximilian and Gleave, Adam and Kanervisto, Anssi and Dormann, Noah},
title = {Stable Baselines3},
year = {2019},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/DLR-RM/stable-baselines3}},
}
@inproceedings{optuna_2019,
title={Optuna: A Next-generation Hyperparameter Optimization Framework},
author={Akiba, Takuya and Sano, Shotaro and Yanase, Toshihiko and Ohta, Takeru and Koyama, Masanori},
booktitle={Proceedings of the 25th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
year={2019}
}
@article{Li2018,
abstract = {Modern learning models are characterized by large hyperparameter spaces and long training times. These properties, coupled with the rise of parallel computing and the growing demand to productionize machine learning workloads, motivate the need to develop mature hyperparameter optimization functionality in distributed computing settings. We address this challenge by first introducing a simple and robust hyperparameter optimization algorithm called ASHA, which exploits parallelism and aggressive early-stopping to tackle large-scale hyperparameter optimization problems. Our extensive empirical results show that ASHA outperforms existing state-of-the-art hyperparameter optimization methods; scales linearly with the number of workers in distributed settings; and is suitable for massive parallelism, as demonstrated on a task with 500 workers. We then describe several design decisions we encountered, along with our associated solutions, when integrating ASHA in Determined AI's end-to-end production-quality machine learning system that offers hyperparameter tuning as a service.},
archivePrefix = {arXiv},
arxivId = {1810.05934},
author = {Li, Liam and Jamieson, Kevin and Rostamizadeh, Afshin and Gonina, Ekaterina and Hardt, Moritz and Recht, Benjamin and Talwalkar, Ameet},
eprint = {1810.05934},
title = {{A System for Massively Parallel Hyperparameter Tuning}},
url = {http://arxiv.org/abs/1810.05934},
year = {2018}
}
@inproceedings{Karpathy2012,
abstract = {Humans and animals acquire their wide repertoire of motor skills through an incremental learning process, during which progressively more complex skills are acquired and subsequently integrated with prior abilities. Inspired by this general idea, we develop an approach for learning motor skills based on a two-level curriculum. At the high level, the curriculum specifies an order in which different skills should be learned. At the low level, the curriculum defines a process for learning within a skill. We develop a set of integrated motor skills for a planar articulated figure capable of doing parameterized hops, flips, rolls, and acrobatic sequences. The same curriculum can be applied to yield individualized motor skill sets for articulated figures of varying proportions. {\textcopyright} 2012 Springer-Verlag.},
author = {Karpathy, Andrej and {Van De Panne}, Michiel},
booktitle = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
doi = {10.1007/978-3-642-30353-1_31},
isbn = {9783642303524},
issn = {03029743},
title = {{Curriculum learning for motor skills}},
year = {2012}
}
@article{Elman1993,
abstract = {It is a striking fact that in humans the greatest learning occurs precisely at that point in time - childhood - when the most dramatic maturational changes also occur. This report describes possible synergistic interactions between maturational change and the ability to learn a complex domain (language), as investigated in connectionist networks. The networks are trained to process complex sentences involving relative clauses, number agreement, and several types of verb argument structure. Training fails in the case of networks which are fully formed and 'adultlike' in their capacity. Training succeeds only when networks begin with limited working memory and gradually 'mature' to the adult state. This result suggests that rather than being a limitation, developmental restrictions on resources may constitute a necessary prerequisite for mastering certain complex domains. Specifically, successful learning may depend on starting small.},
author = {Elman, Jeffrey L.},
issn = {00100277},
journal = {Cognition},
keywords = {Catastrophic forgetting,Grammar learning,Infant acquisition,Neural networks},
number = {1},
pages = {71--99},
title = {{Learning and development in neural networks: The importance of starting small}},
volume = {48},
year = {1993}
}
@article{Sanger1994,
author = {Sanger, Terence D},
journal = {IEEE Transactions on Robotics and Automation},
number = {3},
title = {{Neural Network Learning Control of Robot Manipulators Using Gradually Increasing Task Difficulty}},
volume = {10},
year = {1994}
}
@article{Schlegel2015,
abstract = {With the increasing use of GPUs in science and the resulting computational power, tasks which were too complex a few years back can now be realized and executed in a reasonable time. Deep Machine learning is one of these tasks. In this paper the authors will provide a brief introduction to Neural Networks and the tools used to describe them. Then they will deduce the benefits and issues of running Neural Networks on GPUs.},
author = {Schlegel, Daniel},
journal = {Seminar Talk - Deep Machine Learning on GPUs},
keywords = {Artificial Intelligence,GPU Computing,Deep Machine Learning},
pages = {1},
title = {{Deep Machine Learning on GPUs}},
year = {2015}
}