diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..f2f0487a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,32 @@
+build/
+__pycache__/
+
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..474628ee
--- /dev/null
+++ b/README.md
@@ -0,0 +1,65 @@
+# MangoHud
+
+A modification of the Mesa Vulkan overlay, with personal improvements including temperature reporting and logging capabilities.
+
+#### Comparison (outdated)
+![](assets/overlay_comparison.gif)
+
+# Installation
+- Arch Linux: [PKGBUILD](https://github.com/flightlessmango/PKGBUILDS/blob/master/mangohud/PKGBUILD)
+
+# Normal usage
+
+To enable the MangoHud Vulkan overlay layer, run:
+
+`MANGOHUD=1 /path/to/my_vulkan_app`
+
+Alternatively, add `MANGOHUD=1` to your shell profile.
+
+## MANGOHUD_CONFIG parameters
+
+You can customize the HUD with the MANGOHUD_CONFIG environment variable, separating options with commas.
+
+- `cpu_temp` : Displays the current CPU temperature
+- `gpu_temp` : Displays the current GPU temperature
+- `core_load` : Displays the current CPU load per core
+- `font_size` : Changes the default font size (default is 24)
+- `width` : Sets a custom HUD width
+- `height` : Sets a custom HUD height
+- `position=x`: Available values for `x` include `top-left`, `top-right`, `bottom-left`, and `bottom-right`
+
+Note: width and height are set automatically based on `font_size`, but can be overridden.
+
+Example: `MANGOHUD_CONFIG=cpu_temp,gpu_temp,position=top-right,height=500,font_size=32`
+
+## Environment Variables
+- `MANGOHUD_OUTPUT` : Defines the name and location of the output file (required for logging)
+- `MANGOHUD_FONT`: Changes the default font (set to the location of a .TTF/.OTF file)
+
+## Keybindings
+- `F2` : Toggle logging
+- `F12`: Toggle HUD
+
+## MangoHud FPS logging
+
+When you toggle logging (using the keybind `F2`), a file is created with your chosen name (set with `MANGOHUD_OUTPUT`) plus a date & timestamp.
+
+This file can be uploaded to [Flightlessmango.com](https://flightlessmango.com/games/user_benchmarks) to create graphs automatically. You can share the resulting page with others simply by linking to it.
+
+#### Multiple log files
+
+It's possible to upload multiple files at once on [Flightlessmango.com](https://flightlessmango.com/games/user_benchmarks): rename them to your preferred names and upload them in a batch. These filenames will be used as the legend in the graph.
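+
+For example (the output path below is illustrative), a logging session could be launched like this:
+
+`MANGOHUD_OUTPUT=/home/user/logs/mygame_ MANGOHUD=1 /path/to/my_vulkan_app`
+
+Pressing `F2` in-game then writes a file such as `/home/user/logs/mygame_<date-timestamp>`, ready for upload.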
+
+#### Log uploading walkthrough
+
+![](assets/log_upload_example.gif)
+
+# Notable changes
+- Removed HUD decoration [90a2212](https://github.com/flightlessmango/mesa/commit/90a2212055a8047d46d0220d5fdc30a76900aaed)
+- Changed frametime graph to lines instead of a histogram [e40533b](https://github.com/flightlessmango/mesa/commit/e40533b7f46858e5b9f08829e789277b2364d5d1)
+- Set static min/max ms on the frametime graph to act like Afterburner's graph [df5238f](https://github.com/flightlessmango/mesa/commit/df5238f990218f5d6e698d572b05ddd19e52b108)
+- Added CPU/GPU usage (NVIDIA and AMD only)
+- Changed font to UbuntuMono-Bold [73f0aa9](https://github.com/flightlessmango/mesa/commit/73f0aa94d382365205a4a4128d82208315b0b190)
+- Increased HUD font size [b7d238b](https://github.com/flightlessmango/mesa/commit/b7d238b07eb82153f272d34bf7d1353b701f32e0)
diff --git a/assets/log_upload_example.gif b/assets/log_upload_example.gif
new file mode 100644
index 00000000..474425b9
Binary files /dev/null and b/assets/log_upload_example.gif differ
diff --git a/assets/overlay_comparison.gif b/assets/overlay_comparison.gif
new file mode 100644
index 00000000..00cf7e2b
Binary files /dev/null and b/assets/overlay_comparison.gif differ
diff --git a/bin/gen_enum_to_str.py b/bin/gen_enum_to_str.py
new file mode 100644
index 00000000..c4f8b262
--- /dev/null
+++ b/bin/gen_enum_to_str.py
@@ -0,0 +1,447 @@
+# encoding=utf-8
+# Copyright © 2017 Intel Corporation
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+"""Create enum to string functions for vulkan using vk.xml."""
+
+from __future__ import print_function
+import argparse
+import os
+import textwrap
+import xml.etree.cElementTree as et
+
+from mako.template import Template
+
+COPYRIGHT = textwrap.dedent(u"""\
+    * Copyright © 2017 Intel Corporation
+    *
+    * Permission is hereby granted, free of charge, to any person obtaining a copy
+    * of this software and associated documentation files (the "Software"), to deal
+    * in the Software without restriction, including without limitation the rights
+    * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+    * copies of the Software, and to permit persons to whom the Software is
+    * furnished to do so, subject to the following conditions:
+    *
+    * The above copyright notice and this permission notice shall be included in
+    * all copies or substantial portions of the Software.
+    *
+    * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+    * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+    * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    * SOFTWARE.""")
+
+C_TEMPLATE = Template(textwrap.dedent(u"""\
+    /* Autogenerated file -- do not edit
+     * generated by ${file}
+     *
+     ${copyright}
+     */
+
+    #include <string.h>
+    #include <vulkan/vulkan.h>
+    #include <vulkan/vk_android_native_buffer.h>
+    #include "../src/mesa/util/macros.h"
+    #include "vk_enum_to_str.h"
+
+    % for enum in enums:
+
+    % if enum.guard:
+#ifdef ${enum.guard}
+    % endif
+    const char *
+    vk_${enum.name[2:]}_to_str(${enum.name} input)
+    {
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wswitch"
+        switch(input) {
+    % for v in sorted(enum.values.keys()):
+        case ${v}:
+            return "${enum.values[v]}";
+    % endfor
+        }
+    #pragma GCC diagnostic pop
+        unreachable("Undefined enum value.");
+    }
+
+    % if enum.guard:
+#endif
+    % endif
+    %endfor
+
+    size_t vk_structure_type_size(const struct VkBaseInStructure *item)
+    {
+    #pragma GCC diagnostic push
+    #pragma GCC diagnostic ignored "-Wswitch"
+        switch(item->sType) {
+    % for struct in structs:
+    % if struct.extension is not None and struct.extension.define is not None:
+    #ifdef ${struct.extension.define}
+        case ${struct.stype}: return sizeof(${struct.name});
+    #endif
+    % else:
+        case ${struct.stype}: return sizeof(${struct.name});
+    % endif
+    %endfor
+        }
+    #pragma GCC diagnostic pop
+        unreachable("Undefined struct type.");
+    }
+
+    void vk_load_instance_commands(VkInstance instance,
+                                   PFN_vkGetInstanceProcAddr gpa,
+                                   struct vk_instance_dispatch_table *table)
+    {
+        memset(table, 0, sizeof(*table));
+        table->GetInstanceProcAddr = gpa;
+    % for cmd in commands:
+    % if not cmd.device_entrypoint and cmd.name != 'vkGetInstanceProcAddr':
+    % if cmd.extension is not None and cmd.extension.define is not None:
+    #ifdef ${cmd.extension.define}
+        table->${cmd.name[2:]} = (PFN_${cmd.name}) gpa(instance, "${cmd.name}");
+    #endif
+    % else:
+        table->${cmd.name[2:]} = (PFN_${cmd.name}) gpa(instance, "${cmd.name}");
+    % endif
+    % endif
+    %endfor
+    }
+
+    void vk_load_device_commands(VkDevice device,
+                                 PFN_vkGetDeviceProcAddr gpa,
+                                 struct vk_device_dispatch_table *table)
+    {
+        memset(table, 0, sizeof(*table));
+        table->GetDeviceProcAddr = gpa;
+    % for cmd in commands:
+    % if cmd.device_entrypoint and cmd.name != 'vkGetDeviceProcAddr':
+    % if cmd.extension is not None and cmd.extension.define is not None:
+    #ifdef ${cmd.extension.define}
+        table->${cmd.name[2:]} = (PFN_${cmd.name}) gpa(device, "${cmd.name}");
+    #endif
+    % else:
+        table->${cmd.name[2:]} = (PFN_${cmd.name}) gpa(device, "${cmd.name}");
+    % endif
+    % endif
+    %endfor
+    }
+    """),
+    output_encoding='utf-8')
+
+H_TEMPLATE = Template(textwrap.dedent(u"""\
+    /* Autogenerated file -- do not edit
+     * generated by ${file}
+     *
+     ${copyright}
+     */
+
+    #ifndef MESA_VK_ENUM_TO_STR_H
+    #define MESA_VK_ENUM_TO_STR_H
+
+    #include <vulkan/vulkan.h>
+    #include <vulkan/vk_android_native_buffer.h>
+
+    #ifdef __cplusplus
+    extern "C" {
+    #endif
+
+    % for ext in extensions:
+    #define _${ext.name}_number (${ext.number})
+    % endfor
+
+    % for enum in enums:
+    % if enum.guard:
+#ifdef ${enum.guard}
+    % endif
+    const char * vk_${enum.name[2:]}_to_str(${enum.name} input);
+    % if enum.guard:
+#endif
+    % endif
+    % endfor
+
+    size_t
vk_structure_type_size(const struct VkBaseInStructure *item); + + struct vk_instance_dispatch_table { + PFN_vkGetInstanceProcAddr GetInstanceProcAddr; + % for cmd in commands: + % if not cmd.device_entrypoint and cmd.name != 'vkGetInstanceProcAddr': + % if cmd.extension is not None and cmd.extension.define is not None: + #ifdef ${cmd.extension.define} + PFN_${cmd.name} ${cmd.name[2:]}; + #endif + % else: + PFN_${cmd.name} ${cmd.name[2:]}; + % endif + % endif + %endfor + }; + + struct vk_device_dispatch_table { + PFN_vkGetDeviceProcAddr GetDeviceProcAddr; + % for cmd in commands: + % if cmd.device_entrypoint and cmd.name != 'vkGetDeviceProcAddr': + % if cmd.extension is not None and cmd.extension.define is not None: + #ifdef ${cmd.extension.define} + PFN_${cmd.name} ${cmd.name[2:]}; + #endif + % else: + PFN_${cmd.name} ${cmd.name[2:]}; + % endif + % endif + %endfor + }; + + void vk_load_instance_commands(VkInstance instance, PFN_vkGetInstanceProcAddr gpa, struct vk_instance_dispatch_table *table); + void vk_load_device_commands(VkDevice device, PFN_vkGetDeviceProcAddr gpa, struct vk_device_dispatch_table *table); + + #ifdef __cplusplus + } /* extern "C" */ + #endif + + #endif"""), + output_encoding='utf-8') + + +class NamedFactory(object): + """Factory for creating enums.""" + + def __init__(self, type_): + self.registry = {} + self.type = type_ + + def __call__(self, name, **kwargs): + try: + return self.registry[name] + except KeyError: + n = self.registry[name] = self.type(name, **kwargs) + return n + + def get(self, name): + return self.registry.get(name) + + +class VkExtension(object): + """Simple struct-like class representing extensions""" + + def __init__(self, name, number=None, define=None): + self.name = name + self.number = number + self.define = define + + +class VkEnum(object): + """Simple struct-like class representing a single Vulkan Enum.""" + + def __init__(self, name, values=None): + self.name = name + self.extension = None + # Maps numbers to names + self.values = values or dict() + self.name_to_value = dict() + self.guard = None + self.name_to_alias_list = {} + + def add_value(self, name, value=None, + extnum=None, offset=None, alias=None, + error=False): + if alias is not None: + assert value is None and offset is None + if alias not in self.name_to_value: + # We don't have this alias yet. Just record the alias and + # we'll deal with it later. + alias_list = self.name_to_alias_list.get(alias, []) + alias_list.append(name); + return + + # Use the value from the alias + value = self.name_to_value[alias] + + assert value is not None or extnum is not None + if value is None: + value = 1000000000 + (extnum - 1) * 1000 + offset + if error: + value = -value + + self.name_to_value[name] = value + if value not in self.values: + self.values[value] = name + elif len(self.values[value]) > len(name): + self.values[value] = name + + # Now that the value has been fully added, resolve aliases, if any. 
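+        # (An alias may appear in vk.xml before the value it points at has
+        # been defined; in that case add_value() parks the alias in
+        # name_to_alias_list above, and it is resolved below once the target
+        # value is known.)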
+        if name in self.name_to_alias_list:
+            for alias in self.name_to_alias_list[name]:
+                self.add_value(alias, value=value)
+            del self.name_to_alias_list[name]
+
+    def add_value_from_xml(self, elem, extension=None):
+        self.extension = extension
+        if 'value' in elem.attrib:
+            self.add_value(elem.attrib['name'],
+                           value=int(elem.attrib['value'], base=0))
+        elif 'alias' in elem.attrib:
+            self.add_value(elem.attrib['name'], alias=elem.attrib['alias'])
+        else:
+            error = 'dir' in elem.attrib and elem.attrib['dir'] == '-'
+            if 'extnumber' in elem.attrib:
+                extnum = int(elem.attrib['extnumber'])
+            else:
+                extnum = extension.number
+            self.add_value(elem.attrib['name'],
+                           extnum=extnum,
+                           offset=int(elem.attrib['offset']),
+                           error=error)
+
+    def set_guard(self, g):
+        self.guard = g
+
+
+class VkCommand(object):
+    """Simple struct-like class representing a single Vulkan command"""
+
+    def __init__(self, name, device_entrypoint=False):
+        self.name = name
+        self.device_entrypoint = device_entrypoint
+        self.extension = None
+
+
+class VkChainStruct(object):
+    """Simple struct-like class representing a single Vulkan struct identified with a VkStructureType"""
+    def __init__(self, name, stype):
+        self.name = name
+        self.stype = stype
+        self.extension = None
+
+
+def struct_get_stype(xml_node):
+    for member in xml_node.findall('./member'):
+        name = member.findall('./name')
+        if len(name) > 0 and name[0].text == "sType":
+            return member.get('values')
+    return None
+
+
+def parse_xml(cmd_factory, enum_factory, ext_factory, struct_factory, filename):
+    """Parse the XML file, accumulating results into the factories.
+
+    This is a memory-efficient iterative XML parser; it returns nothing and
+    instead records the VkEnum, VkCommand, VkExtension and VkChainStruct
+    objects it finds in the given factories.
+    """
+
+    xml = et.parse(filename)
+
+    for enum_type in xml.findall('./enums[@type="enum"]'):
+        enum = enum_factory(enum_type.attrib['name'])
+        for value in enum_type.findall('./enum'):
+            enum.add_value_from_xml(value)
+
+    for value in xml.findall('./feature/require/enum[@extends]'):
+        enum = enum_factory.get(value.attrib['extends'])
+        if enum is not None:
+            enum.add_value_from_xml(value)
+
+    for command in xml.findall('./commands/command'):
+        name = command.find('./proto/name')
+        first_arg = command.find('./param/type')
+        # Some commands are alias KHR -> nonKHR, ignore those
+        if name is not None:
+            cmd_factory(name.text,
+                        device_entrypoint=(first_arg.text in ('VkDevice', 'VkCommandBuffer', 'VkQueue')))
+
+    for struct_type in xml.findall('./types/type[@category="struct"]'):
+        name = struct_type.attrib['name']
+        stype = struct_get_stype(struct_type)
+        if stype is not None:
+            struct_factory(name, stype=stype)
+
+    platform_define = {}
+    for platform in xml.findall('./platforms/platform'):
+        name = platform.attrib['name']
+        define = platform.attrib['protect']
+        platform_define[name] = define
+
+    for ext_elem in xml.findall('./extensions/extension[@supported="vulkan"]'):
+        define = None
+        if "platform" in ext_elem.attrib:
+            define = platform_define[ext_elem.attrib['platform']]
+        extension = ext_factory(ext_elem.attrib['name'],
+                                number=int(ext_elem.attrib['number']),
+                                define=define)
+
+        for value in ext_elem.findall('./require/enum[@extends]'):
+            enum = enum_factory.get(value.attrib['extends'])
+            if enum is not None:
+                enum.add_value_from_xml(value, extension)
+        for t in ext_elem.findall('./require/type'):
+            struct = struct_factory.get(t.attrib['name'])
+            if struct is not None:
+                struct.extension = extension
+
+        if define:
+            for value in ext_elem.findall('./require/type[@name]'):
+                enum =
enum_factory.get(value.attrib['name']) + if enum is not None: + enum.set_guard(define) + + for t in ext_elem.findall('./require/command'): + command = cmd_factory.get(t.attrib['name']) + if command is not None: + command.extension = extension + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument('--xml', required=True, + help='Vulkan API XML files', + action='append', + dest='xml_files') + parser.add_argument('--outdir', + help='Directory to put the generated files in', + required=True) + + args = parser.parse_args() + + command_factory = NamedFactory(VkCommand) + enum_factory = NamedFactory(VkEnum) + ext_factory = NamedFactory(VkExtension) + struct_factory = NamedFactory(VkChainStruct) + for filename in args.xml_files: + parse_xml(command_factory, enum_factory, ext_factory, struct_factory, filename) + commands = sorted(command_factory.registry.values(), key=lambda e: e.name) + enums = sorted(enum_factory.registry.values(), key=lambda e: e.name) + extensions = sorted(ext_factory.registry.values(), key=lambda e: e.name) + structs = sorted(struct_factory.registry.values(), key=lambda e: e.name) + + for template, file_ in [(C_TEMPLATE, os.path.join(args.outdir, 'vk_enum_to_str.c')), + (H_TEMPLATE, os.path.join(args.outdir, 'vk_enum_to_str.h'))]: + with open(file_, 'wb') as f: + f.write(template.render( + file=os.path.basename(__file__), + commands=commands, + enums=enums, + extensions=extensions, + structs=structs, + copyright=COPYRIGHT)) + + +if __name__ == '__main__': + main() diff --git a/bin/git_sha1_gen.py b/bin/git_sha1_gen.py new file mode 100644 index 00000000..c6fbf490 --- /dev/null +++ b/bin/git_sha1_gen.py @@ -0,0 +1,50 @@ +""" +Generate the contents of the git_sha1.h file. +The output of this script goes to stdout. +""" + + +import argparse +import os +import os.path +import subprocess +import sys + + +def get_git_sha1(): + """Try to get the git SHA1 with git rev-parse.""" + git_dir = os.path.join(os.path.dirname(sys.argv[0]), '..', '.git') + try: + git_sha1 = subprocess.check_output([ + 'git', + '--git-dir=' + git_dir, + 'rev-parse', + 'HEAD', + ], stderr=open(os.devnull, 'w')).decode("ascii") + except: + # don't print anything if it fails + git_sha1 = '' + return git_sha1 + +def write_if_different(contents): + """ + Avoid touching the output file if it doesn't need modifications + Useful to avoid triggering rebuilds when nothing has changed. + """ + if os.path.isfile(args.output): + with open(args.output, 'r') as file: + if file.read() == contents: + return + with open(args.output, 'w') as file: + file.write(contents) + +parser = argparse.ArgumentParser() +parser.add_argument('--output', help='File to write the #define in', + required=True) +args = parser.parse_args() + +git_sha1 = os.environ.get('MESA_GIT_SHA1_OVERRIDE', get_git_sha1())[:10] +if git_sha1: + write_if_different('#define MESA_GIT_SHA1 " (git-' + git_sha1 + ')"') +else: + write_if_different('#define MESA_GIT_SHA1 ""') diff --git a/include/vulkan/vk_android_native_buffer.h b/include/vulkan/vk_android_native_buffer.h new file mode 100644 index 00000000..8a777407 --- /dev/null +++ b/include/vulkan/vk_android_native_buffer.h @@ -0,0 +1,135 @@ +/* + * Copyright 2015 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __VK_ANDROID_NATIVE_BUFFER_H__
+#define __VK_ANDROID_NATIVE_BUFFER_H__
+
+/* MESA: A hack to avoid #ifdefs in driver code. */
+#ifdef ANDROID
+#include <cutils/native_handle.h>
+#include <hardware/gralloc.h>
+#include <system/window.h>
+#else
+typedef void *buffer_handle_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define VK_ANDROID_native_buffer 1
+#define VK_ANDROID_NATIVE_BUFFER_EXTENSION_NUMBER 11
+
+/* NOTE ON VK_ANDROID_NATIVE_BUFFER_SPEC_VERSION 6
+ *
+ * This version of the extension transitions from gralloc0 to gralloc1 usage
+ * flags (int -> 2x uint64_t). The WSI implementation will temporarily continue
+ * to fill out deprecated fields in VkNativeBufferANDROID, and will call the
+ * deprecated vkGetSwapchainGrallocUsageANDROID if the new
+ * vkGetSwapchainGrallocUsage2ANDROID is not supported. This transitionary
+ * backwards-compatibility support is temporary, and will likely be removed
+ * (along with all gralloc0 support) in a future release.
+ */
+#define VK_ANDROID_NATIVE_BUFFER_SPEC_VERSION 7
+#define VK_ANDROID_NATIVE_BUFFER_EXTENSION_NAME "VK_ANDROID_native_buffer"
+
+#define VK_ANDROID_NATIVE_BUFFER_ENUM(type,id) ((type)(1000000000 + (1000 * (VK_ANDROID_NATIVE_BUFFER_EXTENSION_NUMBER - 1)) + (id)))
+#define VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID VK_ANDROID_NATIVE_BUFFER_ENUM(VkStructureType, 0)
+#define VK_STRUCTURE_TYPE_SWAPCHAIN_IMAGE_CREATE_INFO_ANDROID VK_ANDROID_NATIVE_BUFFER_ENUM(VkStructureType, 1)
+#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID VK_ANDROID_NATIVE_BUFFER_ENUM(VkStructureType, 2)
+
+typedef enum VkSwapchainImageUsageFlagBitsANDROID {
+    VK_SWAPCHAIN_IMAGE_USAGE_SHARED_BIT_ANDROID = 0x00000001,
+    VK_SWAPCHAIN_IMAGE_USAGE_FLAG_BITS_MAX_ENUM = 0x7FFFFFFF
+} VkSwapchainImageUsageFlagBitsANDROID;
+typedef VkFlags VkSwapchainImageUsageFlagsANDROID;
+typedef struct {
+    VkStructureType sType; // must be VK_STRUCTURE_TYPE_NATIVE_BUFFER_ANDROID
+    const void* pNext;
+
+    // Buffer handle and stride returned from gralloc alloc()
+    buffer_handle_t handle;
+    int stride;
+
+    // Gralloc format and usage requested when the buffer was allocated.
+ int format; + int usage; // DEPRECATED in SPEC_VERSION 6 + + // -- Added in SPEC_VERSION 6 -- + struct { + uint64_t consumer; + uint64_t producer; + } usage2; +} VkNativeBufferANDROID; + +typedef struct { + VkStructureType sType; // must be VK_STRUCTURE_TYPE_SWAPCHAIN_IMAGE_CREATE_INFO_ANDROID + const void* pNext; + + VkSwapchainImageUsageFlagsANDROID usage; +} VkSwapchainImageCreateInfoANDROID; + +typedef struct { + VkStructureType sType; // must be VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PRESENTATION_PROPERTIES_ANDROID + const void* pNext; + + VkBool32 sharedImage; +} VkPhysicalDevicePresentationPropertiesANDROID; + +// -- DEPRECATED in SPEC_VERSION 6 -- +typedef VkResult (VKAPI_PTR *PFN_vkGetSwapchainGrallocUsageANDROID)(VkDevice device, VkFormat format, VkImageUsageFlags imageUsage, int* grallocUsage); +// -- ADDED in SPEC_VERSION 6 -- +typedef VkResult (VKAPI_PTR *PFN_vkGetSwapchainGrallocUsage2ANDROID)(VkDevice device, VkFormat format, VkImageUsageFlags imageUsage, VkSwapchainImageUsageFlagsANDROID swapchainImageUsage, uint64_t* grallocConsumerUsage, uint64_t* grallocProducerUsage); +typedef VkResult (VKAPI_PTR *PFN_vkAcquireImageANDROID)(VkDevice device, VkImage image, int nativeFenceFd, VkSemaphore semaphore, VkFence fence); +typedef VkResult (VKAPI_PTR *PFN_vkQueueSignalReleaseImageANDROID)(VkQueue queue, uint32_t waitSemaphoreCount, const VkSemaphore* pWaitSemaphores, VkImage image, int* pNativeFenceFd); + +#ifndef VK_NO_PROTOTYPES +// -- DEPRECATED in SPEC_VERSION 6 -- +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainGrallocUsageANDROID( + VkDevice device, + VkFormat format, + VkImageUsageFlags imageUsage, + int* grallocUsage +); +// -- ADDED in SPEC_VERSION 6 -- +VKAPI_ATTR VkResult VKAPI_CALL vkGetSwapchainGrallocUsage2ANDROID( + VkDevice device, + VkFormat format, + VkImageUsageFlags imageUsage, + VkSwapchainImageUsageFlagsANDROID swapchainImageUsage, + uint64_t* grallocConsumerUsage, + uint64_t* grallocProducerUsage +); +VKAPI_ATTR VkResult VKAPI_CALL vkAcquireImageANDROID( + VkDevice device, + VkImage image, + int nativeFenceFd, + VkSemaphore semaphore, + VkFence fence +); +VKAPI_ATTR VkResult VKAPI_CALL vkQueueSignalReleaseImageANDROID( + VkQueue queue, + uint32_t waitSemaphoreCount, + const VkSemaphore* pWaitSemaphores, + VkImage image, + int* pNativeFenceFd +); +#endif + +#ifdef __cplusplus +} +#endif + +#endif // __VK_ANDROID_NATIVE_BUFFER_H__ diff --git a/include/vulkan/vk_util.h b/include/vulkan/vk_util.h new file mode 100644 index 00000000..8ae384b9 --- /dev/null +++ b/include/vulkan/vk_util.h @@ -0,0 +1,225 @@ +/* + * Copyright © 2017 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifndef VK_UTIL_H
+#define VK_UTIL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* common inlines and macros for vulkan drivers */
+
+#include <vulkan/vulkan.h>
+
+#define vk_foreach_struct(__iter, __start) \
+   for (struct VkBaseOutStructure *__iter = (struct VkBaseOutStructure *)(__start); \
+        __iter; __iter = __iter->pNext)
+
+#define vk_foreach_struct_const(__iter, __start) \
+   for (const struct VkBaseInStructure *__iter = (const struct VkBaseInStructure *)(__start); \
+        __iter; __iter = __iter->pNext)
+
+/**
+ * A wrapper for a Vulkan output array. A Vulkan output array is one that
+ * follows the convention of the parameters to
+ * vkGetPhysicalDeviceQueueFamilyProperties().
+ *
+ * Example Usage:
+ *
+ *    VkResult
+ *    vkGetPhysicalDeviceQueueFamilyProperties(
+ *       VkPhysicalDevice           physicalDevice,
+ *       uint32_t*                  pQueueFamilyPropertyCount,
+ *       VkQueueFamilyProperties*   pQueueFamilyProperties)
+ *    {
+ *       VK_OUTARRAY_MAKE(props, pQueueFamilyProperties,
+ *                         pQueueFamilyPropertyCount);
+ *
+ *       vk_outarray_append(&props, p) {
+ *          p->queueFlags = ...;
+ *          p->queueCount = ...;
+ *       }
+ *
+ *       vk_outarray_append(&props, p) {
+ *          p->queueFlags = ...;
+ *          p->queueCount = ...;
+ *       }
+ *
+ *       return vk_outarray_status(&props);
+ *    }
+ */
+struct __vk_outarray {
+   /** May be null. */
+   void *data;
+
+   /**
+    * Capacity, in number of elements. Capacity is unlimited (UINT32_MAX) if
+    * data is null.
+    */
+   uint32_t cap;
+
+   /**
+    * Count of elements successfully written to the array. Every write is
+    * considered successful if data is null.
+    */
+   uint32_t *filled_len;
+
+   /**
+    * Count of elements that would have been written to the array if its
+    * capacity were sufficient. Vulkan functions often return VK_INCOMPLETE
+    * when `*filled_len < wanted_len`.
+    */
+   uint32_t wanted_len;
+};
+
+static inline void
+__vk_outarray_init(struct __vk_outarray *a,
+                   void *data, uint32_t *restrict len)
+{
+   a->data = data;
+   a->cap = *len;
+   a->filled_len = len;
+   *a->filled_len = 0;
+   a->wanted_len = 0;
+
+   if (a->data == NULL)
+      a->cap = UINT32_MAX;
+}
+
+static inline VkResult
+__vk_outarray_status(const struct __vk_outarray *a)
+{
+   if (*a->filled_len < a->wanted_len)
+      return VK_INCOMPLETE;
+   else
+      return VK_SUCCESS;
+}
+
+static inline void *
+__vk_outarray_next(struct __vk_outarray *a, size_t elem_size)
+{
+   void *p = NULL;
+
+   a->wanted_len += 1;
+
+   if (*a->filled_len >= a->cap)
+      return NULL;
+
+   if (a->data != NULL)
+      p = (uint8_t *)a->data + (*a->filled_len) * elem_size;
+
+   *a->filled_len += 1;
+
+   return p;
+}
+
+#define vk_outarray(elem_t) \
+   struct { \
+      struct __vk_outarray base; \
+      elem_t meta[]; \
+   }
+
+#define vk_outarray_typeof_elem(a) __typeof__((a)->meta[0])
+#define vk_outarray_sizeof_elem(a) sizeof((a)->meta[0])
+
+#define vk_outarray_init(a, data, len) \
+   __vk_outarray_init(&(a)->base, (data), (len))
+
+#define VK_OUTARRAY_MAKE(name, data, len) \
+   vk_outarray(__typeof__((data)[0])) name; \
+   vk_outarray_init(&name, (data), (len))
+
+#define vk_outarray_status(a) \
+   __vk_outarray_status(&(a)->base)
+
+#define vk_outarray_next(a) \
+   ((vk_outarray_typeof_elem(a) *) \
+      __vk_outarray_next(&(a)->base, vk_outarray_sizeof_elem(a)))
+
+/**
+ * Append to a Vulkan output array.
+ *
+ * This is a block-based macro.
For example: + * + * vk_outarray_append(&a, elem) { + * elem->foo = ...; + * elem->bar = ...; + * } + * + * The array `a` has type `vk_outarray(elem_t) *`. It is usually declared with + * VK_OUTARRAY_MAKE(). The variable `elem` is block-scoped and has type + * `elem_t *`. + * + * The macro unconditionally increments the array's `wanted_len`. If the array + * is not full, then the macro also increment its `filled_len` and then + * executes the block. When the block is executed, `elem` is non-null and + * points to the newly appended element. + */ +#define vk_outarray_append(a, elem) \ + for (vk_outarray_typeof_elem(a) *elem = vk_outarray_next(a); \ + elem != NULL; elem = NULL) + +static inline void * +__vk_find_struct(void *start, VkStructureType sType) +{ + vk_foreach_struct(s, start) { + if (s->sType == sType) + return s; + } + + return NULL; +} + +#define vk_find_struct(__start, __sType) \ + __vk_find_struct((__start), VK_STRUCTURE_TYPE_##__sType) + +#define vk_find_struct_const(__start, __sType) \ + (const void *)__vk_find_struct((void *)(__start), VK_STRUCTURE_TYPE_##__sType) + +static inline void +__vk_append_struct(void *start, void *element) +{ + vk_foreach_struct(s, start) { + if (s->pNext) + continue; + + s->pNext = (struct VkBaseOutStructure *) element; + break; + } +} + +uint32_t vk_get_driver_version(void); + +uint32_t vk_get_version_override(void); + +#define VK_EXT_OFFSET (1000000000UL) +#define VK_ENUM_EXTENSION(__enum) \ + ((__enum) >= VK_EXT_OFFSET ? ((((__enum) - VK_EXT_OFFSET) / 1000UL) + 1) : 0) +#define VK_ENUM_OFFSET(__enum) \ + ((__enum) >= VK_EXT_OFFSET ? ((__enum) % 1000) : (__enum)) + +#ifdef __cplusplus +} +#endif + +#endif /* VK_UTIL_H */ diff --git a/meson.build b/meson.build new file mode 100644 index 00000000..789fb58d --- /dev/null +++ b/meson.build @@ -0,0 +1,260 @@ +# Copyright © 2019 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+
+project('mangohud',
+  ['c', 'cpp'],
+  version : 'v1.0.0',
+  license : 'MIT',
+  meson_version : '>= 0.46',
+  default_options : ['buildtype=release', 'b_ndebug=if-release', 'c_std=c99', 'cpp_std=c++14']
+)
+
+cc = meson.get_compiler('c')
+cpp = meson.get_compiler('cpp')
+prog_python = import('python').find_installation('python3')
+
+pre_args = [
+  '-D__STDC_CONSTANT_MACROS',
+  '-D__STDC_FORMAT_MACROS',
+  '-D__STDC_LIMIT_MACROS',
+  '-DPACKAGE_VERSION="@0@"'.format(meson.project_version()),
+]
+
+# Define DEBUG for debug builds only (debugoptimized is not included on this one)
+if get_option('buildtype') == 'debug'
+  pre_args += '-DDEBUG'
+endif
+
+# TODO: this is very incomplete
+if ['linux', 'cygwin', 'gnu'].contains(host_machine.system())
+  pre_args += '-D_GNU_SOURCE'
+  pre_args += '-DHAVE_PTHREAD'
+endif
+
+# Check for GCC style atomics
+if cc.compiles('''#include <stdint.h>
+                  int main() {
+                    struct {
+                      uint64_t *v;
+                    } x;
+                    return (int)__atomic_load_n(x.v, __ATOMIC_ACQUIRE) &
+                           (int)__atomic_add_fetch(x.v, (uint64_t)1, __ATOMIC_ACQ_REL);
+
+                  }''',
+               name : 'GCC atomic builtins')
+  pre_args += '-DUSE_GCC_ATOMIC_BUILTINS'
+endif
+
+# Not in C99, needs POSIX
+if cc.compiles('''
+                  #define _GNU_SOURCE
+                  #include <time.h>
+                  int main() {
+                    struct timespec ts;
+                    return timespec_get(&ts, TIME_UTC);
+
+                  }''',
+               name : 'Supports timespec_get')
+  pre_args += '-DHAVE_TIMESPEC_GET'
+endif
+
+# Check for GCC style builtins
+foreach b : ['bswap32', 'bswap64', 'clz', 'clzll', 'ctz', 'expect', 'ffs',
+             'ffsll', 'popcount', 'popcountll', 'unreachable']
+  if cc.has_function(b)
+    pre_args += '-DHAVE___BUILTIN_@0@'.format(b.to_upper())
+  endif
+endforeach
+
+null_dep = dependency('', required : false)
+
+vulkan_wsi_args = []
+vulkan_wsi_deps = []
+
+with_platform_x11 = true
+with_platform_wayland = false
+with_xlib_lease = true
+
+dep_x11 = dependency('x11')
+dep_xext = dependency('xext')
+dep_xcb = dependency('xcb')
+dep_x11_xcb = dependency('x11-xcb')
+dep_xcb_dri2 = dependency('xcb-dri2', version : '>= 1.8')
+dep_libdrm = dependency(
+  'libdrm', version : '>=' + '2.4.81',
+  required : true
+)
+
+pre_args += '-DHAVE_DRI3'
+dep_xcb_dri3 = dependency('xcb-dri3')
+dep_xcb_present = dependency('xcb-present')
+# until xcb-dri3 has been around long enough to make a hard-dependency:
+if (dep_xcb_dri3.version().version_compare('>= 1.13') and
+    dep_xcb_present.version().version_compare('>= 1.13'))
+  pre_args += '-DHAVE_DRI3_MODIFIERS'
+endif
+dep_xcb_sync = dependency('xcb-sync')
+dep_xshmfence = dependency('xshmfence', version : '>= 1.1')
+
+if with_platform_x11
+  vulkan_wsi_args += ['-DVK_USE_PLATFORM_XCB_KHR', '-DVK_USE_PLATFORM_XLIB_KHR']
+  vulkan_wsi_deps += [
+    dep_xcb,
+    dep_x11_xcb,
+    dep_xcb_dri2,
+    dep_xcb_dri3,
+    dep_xcb_present,
+    dep_xcb_sync,
+    dep_xshmfence,
+  ]
+endif
+if with_platform_wayland
+  dep_wayland_client = dependency('wayland-client', version : '>=1.11')
+  vulkan_wsi_args += ['-DVK_USE_PLATFORM_WAYLAND_KHR']
+  vulkan_wsi_deps += dep_wayland_client
+endif
+
+vulkan_wsi_args += '-DVK_USE_PLATFORM_DISPLAY_KHR'
+vulkan_wsi_deps += [dep_libdrm]
+
+if with_xlib_lease
+  dep_xcb_xrandr = dependency('xcb-randr')
+  dep_xlib_xrandr = dependency('xrandr', version : '>= 1.3')
+  vulkan_wsi_args += '-DVK_USE_PLATFORM_XLIB_XRANDR_EXT'
+  vulkan_wsi_deps += [dep_xcb_xrandr, dep_xlib_xrandr]
+endif
+
+inc_common = [
+  include_directories('include'),
+]
+
+# Check for generic C arguments
+c_args = []
+foreach a : ['-Werror=implicit-function-declaration',
+             '-Werror=missing-prototypes', '-Werror=return-type',
+             '-Werror=incompatible-pointer-types',
+             '-fno-math-errno',
+             '-fno-trapping-math', '-Qunused-arguments']
+  if cc.has_argument(a)
+    c_args += a
+  endif
+endforeach
+
+foreach a : ['missing-field-initializers', 'format-truncation']
+  if cc.has_argument('-W' + a)
+    c_args += '-Wno-' + a
+  endif
+endforeach
+
+c_vis_args = []
+if cc.has_argument('-fvisibility=hidden')
+  c_vis_args += '-fvisibility=hidden'
+endif
+
+# Check for generic C++ arguments
+cpp_args = []
+foreach a : ['-Werror=return-type',
+             '-fno-math-errno', '-fno-trapping-math',
+             '-Qunused-arguments']
+  if cpp.has_argument(a)
+    cpp_args += a
+  endif
+endforeach
+
+# For some reason, the test for -Wno-foo always succeeds with gcc, even if the
+# option is not supported. Hence, check for -Wfoo instead.
+
+foreach a : ['non-virtual-dtor', 'missing-field-initializers', 'format-truncation']
+  if cpp.has_argument('-W' + a)
+    cpp_args += '-Wno-' + a
+  endif
+endforeach
+
+no_override_init_args = []
+foreach a : ['override-init', 'initializer-overrides']
+  if cc.has_argument('-W' + a)
+    no_override_init_args += '-Wno-' + a
+  endif
+endforeach
+
+cpp_vis_args = []
+if cpp.has_argument('-fvisibility=hidden')
+  cpp_vis_args += '-fvisibility=hidden'
+endif
+
+foreach a : pre_args
+  add_project_arguments(a, language : ['c', 'cpp'])
+endforeach
+foreach a : c_args
+  add_project_arguments(a, language : ['c'])
+endforeach
+foreach a : cpp_args
+  add_project_arguments(a, language : ['cpp'])
+endforeach
+
+# check for dl support
+if cc.has_function('dlopen')
+  dep_dl = null_dep
+else
+  dep_dl = cc.find_library('dl')
+endif
+
+dep_pthread = cc.find_library('pthread')
+
+git_sha1_gen_py = files('bin/git_sha1_gen.py')
+sha1_h = custom_target(
+  'git_sha1.h',
+  output : 'git_sha1.h',
+  command : [prog_python, git_sha1_gen_py, '--output', '@OUTPUT@'],
+  build_always : true, # commit sha1 can change without having touched these files
+)
+
+vk_layer_table_helpers = []
+loader_genvk_py = files('modules/Vulkan-Loader/scripts/loader_genvk.py')
+foreach s : ['vk_dispatch_table_helper.h', 'vk_layer_dispatch_table.h']#, 'vk_loader_extensions.h', 'vk_loader_extensions.c']
+  vk_layer_table_helpers += custom_target(
+    s, output : s,
+    command : [prog_python, loader_genvk_py,
+               '-scripts', '../../Vulkan-Docs/scripts', # relative to loader_genvk.py
+               '-registry', join_paths(meson.source_root(), 'modules/Vulkan-Docs/xml/vk.xml'),
+               '-o','@OUTDIR@', s])
+endforeach
+
+vk_api_xml = files('modules/Vulkan-Docs/xml/vk.xml')
+vk_enum_to_str = custom_target(
+  'vk_enum_to_str',
+  input : ['bin/gen_enum_to_str.py', vk_api_xml],
+  output : ['vk_enum_to_str.c', 'vk_enum_to_str.h'],
+  command : [
+    prog_python, '@INPUT0@', '--xml', '@INPUT1@',
+    '--outdir', meson.current_build_dir()
+  ],
+)
+
+util_files = files(
+  'src/mesa/util/hash_table.c',
+  'src/mesa/util/os_socket.c',
+  'src/mesa/util/os_time.c',
+  'src/mesa/util/ralloc.c',
+  'src/mesa/main/hash.c',
+)
+
+subdir('modules/ImGui')
+subdir('src')
\ No newline at end of file
diff --git a/src/cpu_gpu.h b/src/cpu_gpu.h
new file mode 100644
index 00000000..26e4c72f
--- /dev/null
+++ b/src/cpu_gpu.h
@@ -0,0 +1,265 @@
+// NOTE: this include set is inferred from usage in this file
+#include <stdio.h>
+#include <math.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <thread>
+#include <chrono>
+#include <regex>
+using namespace std;
+
+int gpuLoad, gpuTemp, cpuTemp;
+string gpuLoadDisplay, cpuTempLocation;
+FILE *amdGpuFile, *amdTempFile, *cpuTempFile;
+
+const int NUM_CPU_STATES = 10;
+
+struct Cpus{
+  size_t num;
+  string name;
+  int value;
+  string output;
+  int freq;
+};
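+// Layout note (a sketch of the data model used below, as built by
+// coreCounting()): cpuArray[0] holds the aggregate "CPU:" entry and the
+// following entries hold one "CPU n:" slot per logical core, which is why
+// arraySize is numCpuCores + 1.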
+
+size_t numCpuCores = std::thread::hardware_concurrency();
+size_t arraySize = numCpuCores + 1;
+std::vector<Cpus> cpuArray;
+pthread_t cpuThread, gpuThread, cpuInfoThread, nvidiaSmiThread;
+
+string exec(string command) {
+  char buffer[128];
+  string result = "";
+
+  // Open pipe to file
+  FILE* pipe = popen(command.c_str(), "r");
+  if (!pipe) {
+    return "popen failed!";
+  }
+
+  // read the process output until EOF; looping on fgets avoids the classic
+  // while(!feof(...)) pitfall of acting on a failed read
+  while (fgets(buffer, sizeof(buffer), pipe) != NULL)
+    result += buffer;
+
+  pclose(pipe);
+  return result;
+}
+
+void coreCounting(){
+  cpuArray.push_back({0, "CPU:"});
+  for (size_t i = 0; i < arraySize; i++) {
+    size_t offset = i;
+    stringstream ss;
+    ss << "CPU " << offset << ":";
+    string cpuNameString = ss.str();
+    cpuArray.push_back({i+1 , cpuNameString});
+  }
+}
+
+std::string m_cpuUtilizationString;
+
+enum CPUStates
+{
+  S_USER = 0,
+  S_NICE,
+  S_SYSTEM,
+  S_IDLE,
+  S_IOWAIT,
+  S_IRQ,
+  S_SOFTIRQ,
+  S_STEAL,
+  S_GUEST,
+  S_GUEST_NICE
+};
+
+typedef struct CPUData
+{
+  std::string cpu;
+  size_t times[NUM_CPU_STATES];
+} CPUData;
+
+void ReadStatsCPU(std::vector<CPUData> & entries)
+{
+  std::ifstream fileStat("/proc/stat");
+
+  std::string line;
+
+  const std::string STR_CPU("cpu");
+  const std::size_t LEN_STR_CPU = STR_CPU.size();
+  const std::string STR_TOT("tot");
+
+  while(std::getline(fileStat, line))
+  {
+    // cpu stats line found
+    if(!line.compare(0, LEN_STR_CPU, STR_CPU))
+    {
+      std::istringstream ss(line);
+
+      // store entry
+      entries.emplace_back(CPUData());
+      CPUData & entry = entries.back();
+
+      // read cpu label
+      ss >> entry.cpu;
+
+      if(entry.cpu.size() > LEN_STR_CPU)
+        entry.cpu.erase(0, LEN_STR_CPU);
+      else
+        entry.cpu = STR_TOT;
+
+      // read times
+      for(int i = 0; i < NUM_CPU_STATES; ++i)
+        ss >> entry.times[i];
+    }
+  }
+}
+
+size_t GetIdleTime(const CPUData & e)
+{
+  return e.times[S_IDLE] +
+         e.times[S_IOWAIT];
+}
+
+size_t GetActiveTime(const CPUData & e)
+{
+  return e.times[S_USER] +
+         e.times[S_NICE] +
+         e.times[S_SYSTEM] +
+         e.times[S_IRQ] +
+         e.times[S_SOFTIRQ] +
+         e.times[S_STEAL] +
+         e.times[S_GUEST] +
+         e.times[S_GUEST_NICE];
+}
+
+void PrintStats(const std::vector<CPUData> & entries1, const std::vector<CPUData> & entries2)
+{
+  const size_t NUM_ENTRIES = entries1.size();
+
+  for(size_t i = 0; i < NUM_ENTRIES; ++i)
+  {
+    const CPUData & e1 = entries1[i];
+    const CPUData & e2 = entries2[i];
+
+    const float ACTIVE_TIME = static_cast<float>(GetActiveTime(e2) - GetActiveTime(e1));
+    const float IDLE_TIME = static_cast<float>(GetIdleTime(e2) - GetIdleTime(e1));
+    const float TOTAL_TIME = ACTIVE_TIME + IDLE_TIME;
+
+    // percentage of time spent active between the two snapshots
+    cpuArray[i].value = truncf(100.f * ACTIVE_TIME / TOTAL_TIME);
+  }
+}
+
+void *cpuInfo(void *){
+  FILE *cpuInfo = fopen("/proc/cpuinfo", "r");
+  char line[256];
+  int i = 0;
+  while (fgets(line, sizeof(line), cpuInfo)) {
+    std::string row;
+    row = line;
+    if (row.find("MHz") != std::string::npos){
+      row = std::regex_replace(row, std::regex(R"([^0-9.])"), "");
+      cpuArray[i + 1].freq = stoi(row);
+      i++;
+    }
+  }
+
+  fclose(cpuInfo);
+
+  char buff[6];
+  rewind(cpuTempFile);
+  fflush(cpuTempFile);
+  // %5s caps the read at five characters plus NUL, matching buff's size
+  fscanf(cpuTempFile, "%5s", buff);
+  cpuTemp = stoi(buff) / 1000;
+  pthread_detach(cpuInfoThread);
+
+  return NULL;
+}
+
+void *queryNvidiaSmi(void *){
+  vector<string> smiArray;
+  string nvidiaSmi = exec("nvidia-smi --query-gpu=utilization.gpu,temperature.gpu --format=csv,noheader | tr -d ' ' | head -n1 | tr -d '%'");
+  istringstream f(nvidiaSmi);
+  string s;
+  while (getline(f, s, ',')) {
+    smiArray.push_back(s);
+  }
+  gpuLoadDisplay = smiArray[0];
+  gpuLoad = stoi(smiArray[0]);
+  gpuTemp = stoi(smiArray[1]);
+
+  pthread_detach(nvidiaSmiThread);
+  return NULL;
+}
+
+void *getAmdGpuUsage(void *){
+  // buff must hold up to five digits (temperatures are reported in
+  // millidegrees, e.g. "45000") plus the terminating NUL
+  char buff[6];
+  rewind(amdGpuFile);
+  fflush(amdGpuFile);
+  fscanf(amdGpuFile, "%5s", buff);
+  gpuLoadDisplay = buff;
+  gpuLoad = stoi(buff);
+
+  rewind(amdTempFile);
+  fflush(amdTempFile);
+  fscanf(amdTempFile, "%5s", buff);
+  gpuTemp = (stoi(buff) / 1000);
+
+  pthread_detach(gpuThread);
+  return NULL;
+}
+
+void *getCpuUsage(void *)
+{
+  std::vector<CPUData> entries1;
+  std::vector<CPUData> entries2;
+
+  // snapshot 1
+  ReadStatsCPU(entries1);
+
+  // 100ms pause
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+  // snapshot 2
+  ReadStatsCPU(entries2);
+
+  // print output
+  PrintStats(entries1, entries2);
+  pthread_detach(cpuThread);
+  return NULL;
+}
+
+
+void updateCpuStrings(){
+  for (size_t i = 0; i < arraySize; i++) {
+    size_t spacing = 10;
+    string value = to_string(cpuArray[i].value);
+    value.erase( value.find_last_not_of('0') + 1, std::string::npos );
+    size_t correctionValue = (spacing - cpuArray[i].name.length()) - value.length();
+    string correction = "";
+    for (size_t j = 0; j < correctionValue; j++) {
+      correction.append(" ");
+    }
+    stringstream ss;
+    // entry 0 is the aggregate "CPU:" line; per-core entries get padded
+    if (i == 0) {
+      ss << cpuArray[i].name << " " << cpuArray[i].value << "%";
+    } else {
+      ss << cpuArray[i].name << correction << cpuArray[i].value << "%";
+    }
+    cpuArray[i].output = ss.str();
+  }
+}
\ No newline at end of file
diff --git a/src/keybinds.h b/src/keybinds.h
new file mode 100644
index 00000000..2fa403fc
--- /dev/null
+++ b/src/keybinds.h
@@ -0,0 +1,18 @@
+// the two system headers below are inferred from the types used here
+#include <pthread.h>
+#include <X11/Xlib.h>
+#include "X11/keysym.h"
+#include "mesa/util/os_time.h"
+
+double elapsedF2, elapsedF12;
+uint64_t last_f2_press, last_f12_press;
+pthread_t f2;
+char *displayid = getenv("DISPLAY");
+Display *dpy = XOpenDisplay(displayid);
+
+bool key_is_pressed(KeySym ks) {
+  char keys_return[32];
+  XQueryKeymap(dpy, keys_return);
+  KeyCode kc2 = XKeysymToKeycode(dpy, ks);
+  // keys_return is a 256-bit bitmap, one bit per keycode: byte kc2 >> 3,
+  // bit kc2 & 7 is set while that key is held down
+  bool isPressed = !!(keys_return[kc2 >> 3] & (1 << (kc2 & 7)));
+  return isPressed;
+}
diff --git a/src/logging.h b/src/logging.h
new file mode 100644
index 00000000..f243df4a
--- /dev/null
+++ b/src/logging.h
@@ -0,0 +1,64 @@
+// NOTE: this include set is inferred from usage in this file
+#include <string>
+#include <vector>
+#include <fstream>
+#include <ctime>
+#include <cstdlib>
+#include <thread>
+#include <chrono>
+
+#include "mesa/util/os_time.h"
+
+using namespace std;
+
+string os, cpu, gpu, ram, kernel, driver, deviceName;
+bool sysInfoFetched;
+int gpuLoadLog, cpuLoadLog, log_period;
+
+struct logData{
+  double fps;
+  double cpu;
+  double gpu;
+  double previous;
+};
+
+double fps, elapsedLog;
+std::vector<logData> logArray;
+ofstream out;
+const char* duration_env = std::getenv("LOG_DURATION");
+const char* mangohud_output_env = std::getenv("MANGOHUD_OUTPUT");
+const char* log_period_env = std::getenv("LOG_PERIOD");
+int duration, num;
+bool loggingOn;
+uint64_t log_start;
+
+void writeFile(string date){
+  out.open(mangohud_output_env + date, ios::out | ios::app);
+  out << "os," << "cpu," << "gpu," << "ram," << "kernel," << "driver" << endl;
+  out << os << "," << cpu << "," << gpu << "," << ram << "," << kernel << "," << driver << endl;
+  for (size_t i = 0; i < logArray.size(); i++) {
+    out << logArray[i].fps << "," << logArray[i].cpu << "," << logArray[i].gpu << endl;
+  }
+  out.close();
+  logArray.clear();
+}
+
+void *logging(void *){
+  time_t now_log = time(0);
+  tm *log_time = localtime(&now_log);
+  // only tm_mon is zero-based and needs the +1; tm_hour/tm_min/tm_sec
+  // already hold the actual clock values
+  string date = to_string(log_time->tm_year + 1900) + "-" + to_string(1 + log_time->tm_mon) + "-" + to_string(log_time->tm_mday) + "_" + to_string(log_time->tm_hour) + "-" + to_string(log_time->tm_min) + "-" + to_string(log_time->tm_sec);
+  log_start = os_time_get();
+  out.open(mangohud_output_env + date, ios::out | ios::app);
+
+  while (loggingOn){
+    uint64_t now = os_time_get();
+    elapsedLog = (double)(now - log_start);
+    out << fps << "," << cpuLoadLog << "," << gpuLoadLog << "," << now - log_start << endl;
+    // logArray.push_back({fps, cpuLoadLog, gpuLoadLog, 0.0f});
+
+    if ((elapsedLog) >= duration * 1000000 && duration_env)
+      loggingOn = false;
+
+    this_thread::sleep_for(chrono::milliseconds(log_period));
+  }
+  // writeFile(date);
+  out.close();
+  return NULL;
+}
\ No newline at end of file
diff --git a/src/mesa-overlay-control.py b/src/mesa-overlay-control.py
new file mode 100755
index 00000000..6947250c
--- /dev/null
+++ b/src/mesa-overlay-control.py
@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+import os
+import socket
+import sys
+import select
+from select import EPOLLIN, EPOLLPRI, EPOLLERR
+import time
+from collections import namedtuple
+import argparse
+
+TIMEOUT = 1.0 # seconds
+
+VERSION_HEADER = bytearray('MesaOverlayControlVersion', 'utf-8')
+DEVICE_NAME_HEADER = bytearray('DeviceName', 'utf-8')
+MESA_VERSION_HEADER = bytearray('MesaVersion', 'utf-8')
+
+DEFAULT_SERVER_ADDRESS = "\0mesa_overlay"
+
+class Connection:
+    def __init__(self, path):
+        # Create a Unix Domain socket and connect
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        try:
+            sock.connect(path)
+        except socket.error as msg:
+            print(msg)
+            sys.exit(1)
+
+        self.sock = sock
+
+        # initialize poll interface and register socket
+        epoll = select.epoll()
+        epoll.register(sock, EPOLLIN | EPOLLPRI | EPOLLERR)
+        self.epoll = epoll
+
+    def recv(self, timeout):
+        '''
+        timeout as float in seconds
+        returns:
+            - None on error or disconnection
+            - bytes() (empty) on timeout
+        '''
+
+        events = self.epoll.poll(timeout)
+        for ev in events:
+            (fd, event) = ev
+            if fd != self.sock.fileno():
+                continue
+
+            # check for socket error
+            if event & EPOLLERR:
+                return None
+
+            # EPOLLIN or EPOLLPRI, just read the message
+            msg = self.sock.recv(4096)
+
+            # socket disconnected
+            if len(msg) == 0:
+                return None
+
+            return msg
+
+        return bytes()
+
+    def send(self, msg):
+        self.sock.send(msg)
+
+class MsgParser:
+    MSGBEGIN = bytes(':', 'utf-8')[0]
+    MSGEND = bytes(';', 'utf-8')[0]
+    MSGSEP = bytes('=', 'utf-8')[0]
+
+    def __init__(self, conn):
+        self.cmdpos = 0
+        self.parampos = 0
+        self.bufferpos = 0
+        self.reading_cmd = False
+        self.reading_param = False
+        self.buffer = None
+        self.cmd = bytearray(4096)
+        self.param = bytearray(4096)
+
+        self.conn = conn
+
+    def readCmd(self, ncmds, timeout=TIMEOUT):
+        '''
+        returns:
+            - None on error or disconnection
+            - bytes() (empty) on timeout
+        '''
+
+        parsed = []
+
+        remaining = timeout
+
+        while remaining > 0 and ncmds > 0:
+            now = time.monotonic()
+
+            if self.buffer == None:
+                self.buffer = self.conn.recv(remaining)
+                self.bufferpos = 0
+
+            # disconnected or error
+            if self.buffer == None:
+                return None
+
+            for i in range(self.bufferpos, len(self.buffer)):
+                c = self.buffer[i]
+                self.bufferpos += 1
+                if c == self.MSGBEGIN:
+                    self.cmdpos = 0
+                    self.parampos = 0
+                    self.reading_cmd = True
+                    self.reading_param = False
+                elif c == self.MSGEND:
+                    if not self.reading_cmd:
+                        continue
+                    self.reading_cmd = False
+                    self.reading_param = False
+
+                    cmd = self.cmd[0:self.cmdpos]
+                    param = self.param[0:self.parampos]
+                    self.reading_cmd = False
self.reading_param = False + + parsed.append((cmd, param)) + ncmds -= 1 + if ncmds == 0: + break + elif c == self.MSGSEP: + if self.reading_cmd: + self.reading_param = True + else: + if self.reading_param: + self.param[self.parampos] = c + self.parampos += 1 + elif self.reading_cmd: + self.cmd[self.cmdpos] = c + self.cmdpos += 1 + + # if we read the entire buffer and didn't finish the command, + # throw it away + self.buffer = None + + # check if we have time for another iteration + elapsed = time.monotonic() - now + remaining = max(0, remaining - elapsed) + + # timeout + return parsed + +def control(args): + if args.socket: + address = '\0' + args.socket + else: + address = DEFAULT_SERVER_ADDRESS + + conn = Connection(address) + msgparser = MsgParser(conn) + + version = None + name = None + mesa_version = None + + msgs = msgparser.readCmd(3) + + for m in msgs: + cmd, param = m + if cmd == VERSION_HEADER: + version = int(param) + elif cmd == DEVICE_NAME_HEADER: + name = param.decode('utf-8') + elif cmd == MESA_VERSION_HEADER: + mesa_version = param.decode('utf-8') + + if version != 1 or name == None or mesa_version == None: + print('ERROR: invalid protocol') + sys.exit(1) + + + if args.info: + info = "Protocol Version: {}\n" + info += "Device Name: {}\n" + info += "Mesa Version: {}" + print(info.format(version, name, mesa_version)) + + if args.cmd == 'start-capture': + conn.send(bytearray(':capture=1;', 'utf-8')) + elif args.cmd == 'stop-capture': + conn.send(bytearray(':capture=0;', 'utf-8')) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='MESA_overlay control client') + parser.add_argument('--info', action='store_true', help='Print info from socket') + parser.add_argument('--socket', '-s', type=str, help='Path to socket') + + commands = parser.add_subparsers(help='commands to run', dest='cmd') + commands.add_parser('start-capture') + commands.add_parser('stop-capture') + + args = parser.parse_args() + + control(args) diff --git a/src/mesa/c11/threads.h b/src/mesa/c11/threads.h new file mode 100644 index 00000000..17ded84b --- /dev/null +++ b/src/mesa/c11/threads.h @@ -0,0 +1,73 @@ +/* + * C11 emulation library + * + * (C) Copyright yohhoy 2012. + * Distributed under the Boost Software License, Version 1.0. + * + * Permission is hereby granted, free of charge, to any person or organization + * obtaining a copy of the software and accompanying documentation covered by + * this license (the "Software") to use, reproduce, display, distribute, + * execute, and transmit the Software, and to prepare [[derivative work]]s of the + * Software, and to permit third-parties to whom the Software is furnished to + * do so, all subject to the following: + * + * The copyright notices in the Software and this entire statement, including + * the above license grant, this restriction and the following disclaimer, + * must be included in all copies of the Software, in whole or in part, and + * all derivative works of the Software, unless such copies or derivative + * works are solely in the form of machine-executable object code generated by + * a source language processor. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. 
IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef EMULATED_THREADS_H_INCLUDED_
+#define EMULATED_THREADS_H_INCLUDED_
+
+#include <time.h>
+
+#ifndef TIME_UTC
+#define TIME_UTC 1
+#endif
+
+#include "../c99_compat.h" /* for `inline` */
+
+/*---------------------------- types ----------------------------*/
+typedef void (*tss_dtor_t)(void*);
+typedef int (*thrd_start_t)(void*);
+
+
+/*-------------------- enumeration constants --------------------*/
+enum {
+    mtx_plain     = 0,
+    mtx_try       = 1,
+    mtx_timed     = 2,
+    mtx_recursive = 4
+};
+
+enum {
+    thrd_success = 0, // succeeded
+    thrd_timeout,     // timeout
+    thrd_error,       // failed
+    thrd_busy,        // resource busy
+    thrd_nomem        // out of memory
+};
+
+/*-------------------------- functions --------------------------*/
+
+#if defined(_WIN32) && !defined(__CYGWIN__)
+#include "threads_win32.h"
+#elif defined(HAVE_PTHREAD)
+#include "threads_posix.h"
+#else
+#error Not supported on this platform.
+#endif
+
+
+
+#endif /* EMULATED_THREADS_H_INCLUDED_ */
diff --git a/src/mesa/c11/threads_posix.h b/src/mesa/c11/threads_posix.h
new file mode 100644
index 00000000..45cb6075
--- /dev/null
+++ b/src/mesa/c11/threads_posix.h
@@ -0,0 +1,396 @@
+/*
+ * C11 emulation library
+ *
+ * (C) Copyright yohhoy 2012.
+ * Distributed under the Boost Software License, Version 1.0.
+ *
+ * Permission is hereby granted, free of charge, to any person or organization
+ * obtaining a copy of the software and accompanying documentation covered by
+ * this license (the "Software") to use, reproduce, display, distribute,
+ * execute, and transmit the Software, and to prepare [[derivative work]]s of the
+ * Software, and to permit third-parties to whom the Software is furnished to
+ * do so, all subject to the following:
+ *
+ * The copyright notices in the Software and this entire statement, including
+ * the above license grant, this restriction and the following disclaimer,
+ * must be included in all copies of the Software, in whole or in part, and
+ * all derivative works of the Software, unless such copies or derivative
+ * works are solely in the form of machine-executable object code generated by
+ * a source language processor.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#include <stdlib.h>
+#ifndef assert
+#include <assert.h>
+#endif
+#include <limits.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sched.h>
+#include <stdint.h> /* for intptr_t */
+
+/*
+Configuration macro:
+
+  EMULATED_THREADS_USE_NATIVE_TIMEDLOCK
+    Use pthread_mutex_timedlock() for `mtx_timedlock()'
+    Otherwise use mtx_trylock() + *busy loop* emulation.
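+    (i.e. on platforms where pthread_mutex_timedlock() is unavailable,
+    mtx_timedlock() below falls back to polling mtx_trylock() and
+    thrd_yield() until the deadline passes)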
+*/
+#if !defined(__CYGWIN__) && !defined(__APPLE__) && !defined(__NetBSD__)
+#define EMULATED_THREADS_USE_NATIVE_TIMEDLOCK
+#endif
+
+
+#include <pthread.h>
+
+/*---------------------------- macros ----------------------------*/
+#define ONCE_FLAG_INIT PTHREAD_ONCE_INIT
+#ifdef INIT_ONCE_STATIC_INIT
+#define TSS_DTOR_ITERATIONS PTHREAD_DESTRUCTOR_ITERATIONS
+#else
+#define TSS_DTOR_ITERATIONS 1  // assume TSS dtor MAY be called at least once.
+#endif
+
+// FIXME: temporary non-standard hack to ease transition
+#define _MTX_INITIALIZER_NP PTHREAD_MUTEX_INITIALIZER
+
+/*---------------------------- types ----------------------------*/
+typedef pthread_cond_t  cnd_t;
+typedef pthread_t       thrd_t;
+typedef pthread_key_t   tss_t;
+typedef pthread_mutex_t mtx_t;
+typedef pthread_once_t  once_flag;
+
+
+/*
+Implementation limits:
+  - Conditionally emulation for "mutex with timeout"
+    (see EMULATED_THREADS_USE_NATIVE_TIMEDLOCK macro)
+*/
+struct impl_thrd_param {
+    thrd_start_t func;
+    void *arg;
+};
+
+static inline void *
+impl_thrd_routine(void *p)
+{
+    struct impl_thrd_param pack = *((struct impl_thrd_param *)p);
+    free(p);
+    return (void*)(intptr_t)pack.func(pack.arg);
+}
+
+
+/*--------------- 7.25.2 Initialization functions ---------------*/
+// 7.25.2.1
+static inline void
+call_once(once_flag *flag, void (*func)(void))
+{
+    pthread_once(flag, func);
+}
+
+
+/*------------- 7.25.3 Condition variable functions -------------*/
+// 7.25.3.1
+static inline int
+cnd_broadcast(cnd_t *cond)
+{
+    assert(cond != NULL);
+    return (pthread_cond_broadcast(cond) == 0) ? thrd_success : thrd_error;
+}
+
+// 7.25.3.2
+static inline void
+cnd_destroy(cnd_t *cond)
+{
+    assert(cond);
+    pthread_cond_destroy(cond);
+}
+
+// 7.25.3.3
+static inline int
+cnd_init(cnd_t *cond)
+{
+    assert(cond != NULL);
+    return (pthread_cond_init(cond, NULL) == 0) ? thrd_success : thrd_error;
+}
+
+// 7.25.3.4
+static inline int
+cnd_signal(cnd_t *cond)
+{
+    assert(cond != NULL);
+    return (pthread_cond_signal(cond) == 0) ? thrd_success : thrd_error;
+}
+
+// 7.25.3.5
+static inline int
+cnd_timedwait(cnd_t *cond, mtx_t *mtx, const struct timespec *abs_time)
+{
+    int rt;
+
+    assert(mtx != NULL);
+    assert(cond != NULL);
+    assert(abs_time != NULL);
+
+    rt = pthread_cond_timedwait(cond, mtx, abs_time);
+    if (rt == ETIMEDOUT)
+        return thrd_busy;
+    return (rt == 0) ? thrd_success : thrd_error;
+}
+
+// 7.25.3.6
+static inline int
+cnd_wait(cnd_t *cond, mtx_t *mtx)
+{
+    assert(mtx != NULL);
+    assert(cond != NULL);
+    return (pthread_cond_wait(cond, mtx) == 0) ? thrd_success : thrd_error;
+}
+
+
+/*-------------------- 7.25.4 Mutex functions --------------------*/
+// 7.25.4.1
+static inline void
+mtx_destroy(mtx_t *mtx)
+{
+    assert(mtx != NULL);
+    pthread_mutex_destroy(mtx);
+}
+
+/*
+ * XXX: Workaround when building with -O0 and without pthreads link.
+ *
+ * In such cases constant folding and dead code elimination won't be
+ * available, thus the compiler will always add the pthread_mutexattr*
+ * functions into the binary. As we try to link, we'll fail as the
+ * symbols are unresolved.
+ *
+ * Ideally we'll enable the optimisations locally, yet that does not
+ * seem to work.
+ *
+ * So the alternative workaround is to annotate the symbols as weak.
+ * Thus the linker will be happy and things don't clash when building
+ * with -O1 or greater.
+ */ +#if defined(HAVE_FUNC_ATTRIBUTE_WEAK) && !defined(__CYGWIN__) +__attribute__((weak)) +int pthread_mutexattr_init(pthread_mutexattr_t *attr); + +__attribute__((weak)) +int pthread_mutexattr_settype(pthread_mutexattr_t *attr, int type); + +__attribute__((weak)) +int pthread_mutexattr_destroy(pthread_mutexattr_t *attr); +#endif + +// 7.25.4.2 +static inline int +mtx_init(mtx_t *mtx, int type) +{ + pthread_mutexattr_t attr; + assert(mtx != NULL); + if (type != mtx_plain && type != mtx_timed && type != mtx_try + && type != (mtx_plain|mtx_recursive) + && type != (mtx_timed|mtx_recursive) + && type != (mtx_try|mtx_recursive)) + return thrd_error; + + if ((type & mtx_recursive) == 0) { + pthread_mutex_init(mtx, NULL); + return thrd_success; + } + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + pthread_mutex_init(mtx, &attr); + pthread_mutexattr_destroy(&attr); + return thrd_success; +} + +// 7.25.4.3 +static inline int +mtx_lock(mtx_t *mtx) +{ + assert(mtx != NULL); + return (pthread_mutex_lock(mtx) == 0) ? thrd_success : thrd_error; +} + +static inline int +mtx_trylock(mtx_t *mtx); + +static inline void +thrd_yield(void); + +// 7.25.4.4 +static inline int +mtx_timedlock(mtx_t *mtx, const struct timespec *ts) +{ + assert(mtx != NULL); + assert(ts != NULL); + + { +#ifdef EMULATED_THREADS_USE_NATIVE_TIMEDLOCK + int rt; + rt = pthread_mutex_timedlock(mtx, ts); + if (rt == 0) + return thrd_success; + return (rt == ETIMEDOUT) ? thrd_busy : thrd_error; +#else + time_t expire = time(NULL); + expire += ts->tv_sec; + while (mtx_trylock(mtx) != thrd_success) { + time_t now = time(NULL); + if (expire < now) + return thrd_busy; + // busy loop! + thrd_yield(); + } + return thrd_success; +#endif + } +} + +// 7.25.4.5 +static inline int +mtx_trylock(mtx_t *mtx) +{ + assert(mtx != NULL); + return (pthread_mutex_trylock(mtx) == 0) ? thrd_success : thrd_busy; +} + +// 7.25.4.6 +static inline int +mtx_unlock(mtx_t *mtx) +{ + assert(mtx != NULL); + return (pthread_mutex_unlock(mtx) == 0) ? thrd_success : thrd_error; +} + + +/*------------------- 7.25.5 Thread functions -------------------*/ +// 7.25.5.1 +static inline int +thrd_create(thrd_t *thr, thrd_start_t func, void *arg) +{ + struct impl_thrd_param *pack; + assert(thr != NULL); + pack = (struct impl_thrd_param *)malloc(sizeof(struct impl_thrd_param)); + if (!pack) return thrd_nomem; + pack->func = func; + pack->arg = arg; + if (pthread_create(thr, NULL, impl_thrd_routine, pack) != 0) { + free(pack); + return thrd_error; + } + return thrd_success; +} + +// 7.25.5.2 +static inline thrd_t +thrd_current(void) +{ + return pthread_self(); +} + +// 7.25.5.3 +static inline int +thrd_detach(thrd_t thr) +{ + return (pthread_detach(thr) == 0) ? 
        thrd_success : thrd_error;
+}
+
+// 7.25.5.4
+static inline int
+thrd_equal(thrd_t thr0, thrd_t thr1)
+{
+    return pthread_equal(thr0, thr1);
+}
+
+// 7.25.5.5
+static inline void
+thrd_exit(int res)
+{
+    pthread_exit((void*)(intptr_t)res);
+}
+
+// 7.25.5.6
+static inline int
+thrd_join(thrd_t thr, int *res)
+{
+    void *code;
+    if (pthread_join(thr, &code) != 0)
+        return thrd_error;
+    if (res)
+        *res = (int)(intptr_t)code;
+    return thrd_success;
+}
+
+// 7.25.5.7
+static inline void
+thrd_sleep(const struct timespec *time_point, struct timespec *remaining)
+{
+    assert(time_point != NULL);
+    nanosleep(time_point, remaining);
+}
+
+// 7.25.5.8
+static inline void
+thrd_yield(void)
+{
+    sched_yield();
+}
+
+
+/*----------- 7.25.6 Thread-specific storage functions -----------*/
+// 7.25.6.1
+static inline int
+tss_create(tss_t *key, tss_dtor_t dtor)
+{
+    assert(key != NULL);
+    return (pthread_key_create(key, dtor) == 0) ? thrd_success : thrd_error;
+}
+
+// 7.25.6.2
+static inline void
+tss_delete(tss_t key)
+{
+    pthread_key_delete(key);
+}
+
+// 7.25.6.3
+static inline void *
+tss_get(tss_t key)
+{
+    return pthread_getspecific(key);
+}
+
+// 7.25.6.4
+static inline int
+tss_set(tss_t key, void *val)
+{
+    return (pthread_setspecific(key, val) == 0) ? thrd_success : thrd_error;
+}
+
+
+/*-------------------- 7.25.7 Time functions --------------------*/
+// 7.25.6.1
+#ifndef HAVE_TIMESPEC_GET
+static inline int
+timespec_get(struct timespec *ts, int base)
+{
+    if (!ts) return 0;
+    if (base == TIME_UTC) {
+        clock_gettime(CLOCK_REALTIME, ts);
+        return base;
+    }
+    return 0;
+}
+#endif
diff --git a/src/mesa/c11/threads_win32.h b/src/mesa/c11/threads_win32.h
new file mode 100644
index 00000000..326cfc46
--- /dev/null
+++ b/src/mesa/c11/threads_win32.h
@@ -0,0 +1,653 @@
+/*
+ * C11 emulation library
+ *
+ * (C) Copyright yohhoy 2012.
+ * Distributed under the Boost Software License, Version 1.0.
+ *
+ * Permission is hereby granted, free of charge, to any person or organization
+ * obtaining a copy of the software and accompanying documentation covered by
+ * this license (the "Software") to use, reproduce, display, distribute,
+ * execute, and transmit the Software, and to prepare [[derivative work]]s of the
+ * Software, and to permit third-parties to whom the Software is furnished to
+ * do so, all subject to the following:
+ *
+ * The copyright notices in the Software and this entire statement, including
+ * the above license grant, this restriction and the following disclaimer,
+ * must be included in all copies of the Software, in whole or in part, and
+ * all derivative works of the Software, unless such copies or derivative
+ * works are solely in the form of machine-executable object code generated by
+ * a source language processor.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+ * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+ * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+#ifndef assert
+#include <assert.h>
+#endif
+#include <limits.h>
+#include <errno.h>
+#include <process.h>  // MSVCRT
+#include <stdlib.h>
+
+/*
+Configuration macro:
+
+  EMULATED_THREADS_USE_NATIVE_CALL_ONCE
+    Use native WindowsAPI one-time initialization function.
+    (requires WinVista or later)
+    Otherwise emulate by mtx_trylock() + *busy loop* for WinXP.
+
+  EMULATED_THREADS_USE_NATIVE_CV
+    Use native WindowsAPI condition variable object.
+    (requires WinVista or later)
+    Otherwise use emulated implementation for WinXP.
+
+  EMULATED_THREADS_TSS_DTOR_SLOTNUM
+    Max registerable TSS dtor number.
+*/
+
+// XXX: Retain XP compatibility
+#if 0
+#if _WIN32_WINNT >= 0x0600
+// Prefer native WindowsAPI on newer environment.
+#if !defined(__MINGW32__)
+#define EMULATED_THREADS_USE_NATIVE_CALL_ONCE
+#endif
+#define EMULATED_THREADS_USE_NATIVE_CV
+#endif
+#endif
+#define EMULATED_THREADS_TSS_DTOR_SLOTNUM 64  // see TLS_MINIMUM_AVAILABLE
+
+
+#include <windows.h>
+
+// check configuration
+#if defined(EMULATED_THREADS_USE_NATIVE_CALL_ONCE) && (_WIN32_WINNT < 0x0600)
+#error EMULATED_THREADS_USE_NATIVE_CALL_ONCE requires _WIN32_WINNT>=0x0600
+#endif
+
+#if defined(EMULATED_THREADS_USE_NATIVE_CV) && (_WIN32_WINNT < 0x0600)
+#error EMULATED_THREADS_USE_NATIVE_CV requires _WIN32_WINNT>=0x0600
+#endif
+
+/* Visual Studio 2015 and later */
+#ifdef _MSC_VER
+#define HAVE_TIMESPEC_GET
+#endif
+
+/*---------------------------- macros ----------------------------*/
+#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE
+#define ONCE_FLAG_INIT INIT_ONCE_STATIC_INIT
+#else
+#define ONCE_FLAG_INIT {0}
+#endif
+#define TSS_DTOR_ITERATIONS 1
+
+// FIXME: temporary non-standard hack to ease transition
+#define _MTX_INITIALIZER_NP {(PCRITICAL_SECTION_DEBUG)-1, -1, 0, 0, 0, 0}
+
+/*---------------------------- types ----------------------------*/
+typedef struct cnd_t {
+#ifdef EMULATED_THREADS_USE_NATIVE_CV
+    CONDITION_VARIABLE condvar;
+#else
+    int blocked;
+    int gone;
+    int to_unblock;
+    HANDLE sem_queue;
+    HANDLE sem_gate;
+    CRITICAL_SECTION monitor;
+#endif
+} cnd_t;
+
+typedef HANDLE thrd_t;
+
+typedef DWORD tss_t;
+
+typedef CRITICAL_SECTION mtx_t;
+
+#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE
+typedef INIT_ONCE once_flag;
+#else
+typedef struct once_flag_t {
+    volatile LONG status;
+} once_flag;
+#endif
+
+
+static inline void * tss_get(tss_t key);
+static inline void thrd_yield(void);
+static inline int mtx_trylock(mtx_t *mtx);
+static inline int mtx_lock(mtx_t *mtx);
+static inline int mtx_unlock(mtx_t *mtx);
+
+/*
+Implementation limits:
+  - Conditional emulation for "Initialization functions"
+    (see EMULATED_THREADS_USE_NATIVE_CALL_ONCE macro)
+  - Emulated `mtx_timedlock()' with mtx_trylock() + *busy loop*
+*/
+static void impl_tss_dtor_invoke(void);  // forward decl.
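
An aside on usage (illustrative only, not part of the patch): both the POSIX and the Win32 backend expose the same C11-style surface, so callers can stay platform-neutral. A minimal sketch, assuming it is compiled next to these headers (the relative include path mirrors how `hash.h` pulls in `../c11/threads.h`; with a real C11 libc, `<threads.h>` would do):

```c
#include <stdio.h>
#include "../c11/threads.h"  /* emulated C11 threads API sketched above */

static mtx_t lock;
static int counter;

/* thrd_start_t: takes a void*, returns an int exit code. */
static int worker(void *arg)
{
    (void)arg;
    for (int i = 0; i < 1000; i++) {
        mtx_lock(&lock);     /* returns thrd_success on success */
        counter++;
        mtx_unlock(&lock);
    }
    return 0;
}

int main(void)
{
    thrd_t a, b;
    if (mtx_init(&lock, mtx_plain) != thrd_success)
        return 1;
    thrd_create(&a, worker, NULL);
    thrd_create(&b, worker, NULL);
    thrd_join(a, NULL);      /* NULL: exit code not needed */
    thrd_join(b, NULL);
    mtx_destroy(&lock);
    printf("counter = %d\n", counter);  /* always 2000 under the mutex */
    return 0;
}
```

On Linux this resolves to threads_posix.h (built with `HAVE_PTHREAD` defined and linked against pthreads); on Windows the same source compiles against threads_win32.h.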
+ +struct impl_thrd_param { + thrd_start_t func; + void *arg; +}; + +static unsigned __stdcall impl_thrd_routine(void *p) +{ + struct impl_thrd_param pack; + int code; + memcpy(&pack, p, sizeof(struct impl_thrd_param)); + free(p); + code = pack.func(pack.arg); + impl_tss_dtor_invoke(); + return (unsigned)code; +} + +static DWORD impl_timespec2msec(const struct timespec *ts) +{ + return (DWORD)((ts->tv_sec * 1000U) + (ts->tv_nsec / 1000000L)); +} + +#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE +struct impl_call_once_param { void (*func)(void); }; +static BOOL CALLBACK impl_call_once_callback(PINIT_ONCE InitOnce, PVOID Parameter, PVOID *Context) +{ + struct impl_call_once_param *param = (struct impl_call_once_param*)Parameter; + (param->func)(); + ((void)InitOnce); ((void)Context); // suppress warning + return TRUE; +} +#endif // ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE + +#ifndef EMULATED_THREADS_USE_NATIVE_CV +/* +Note: + The implementation of condition variable is ported from Boost.Interprocess + See http://www.boost.org/boost/interprocess/sync/windows/condition.hpp +*/ +static void impl_cond_do_signal(cnd_t *cond, int broadcast) +{ + int nsignal = 0; + + EnterCriticalSection(&cond->monitor); + if (cond->to_unblock != 0) { + if (cond->blocked == 0) { + LeaveCriticalSection(&cond->monitor); + return; + } + if (broadcast) { + cond->to_unblock += nsignal = cond->blocked; + cond->blocked = 0; + } else { + nsignal = 1; + cond->to_unblock++; + cond->blocked--; + } + } else if (cond->blocked > cond->gone) { + WaitForSingleObject(cond->sem_gate, INFINITE); + if (cond->gone != 0) { + cond->blocked -= cond->gone; + cond->gone = 0; + } + if (broadcast) { + nsignal = cond->to_unblock = cond->blocked; + cond->blocked = 0; + } else { + nsignal = cond->to_unblock = 1; + cond->blocked--; + } + } + LeaveCriticalSection(&cond->monitor); + + if (0 < nsignal) + ReleaseSemaphore(cond->sem_queue, nsignal, NULL); +} + +static int impl_cond_do_wait(cnd_t *cond, mtx_t *mtx, const struct timespec *ts) +{ + int nleft = 0; + int ngone = 0; + int timeout = 0; + DWORD w; + + WaitForSingleObject(cond->sem_gate, INFINITE); + cond->blocked++; + ReleaseSemaphore(cond->sem_gate, 1, NULL); + + mtx_unlock(mtx); + + w = WaitForSingleObject(cond->sem_queue, ts ? impl_timespec2msec(ts) : INFINITE); + timeout = (w == WAIT_TIMEOUT); + + EnterCriticalSection(&cond->monitor); + if ((nleft = cond->to_unblock) != 0) { + if (timeout) { + if (cond->blocked != 0) { + cond->blocked--; + } else { + cond->gone++; + } + } + if (--cond->to_unblock == 0) { + if (cond->blocked != 0) { + ReleaseSemaphore(cond->sem_gate, 1, NULL); + nleft = 0; + } + else if ((ngone = cond->gone) != 0) { + cond->gone = 0; + } + } + } else if (++cond->gone == INT_MAX/2) { + WaitForSingleObject(cond->sem_gate, INFINITE); + cond->blocked -= cond->gone; + ReleaseSemaphore(cond->sem_gate, 1, NULL); + cond->gone = 0; + } + LeaveCriticalSection(&cond->monitor); + + if (nleft == 1) { + while (ngone--) + WaitForSingleObject(cond->sem_queue, INFINITE); + ReleaseSemaphore(cond->sem_gate, 1, NULL); + } + + mtx_lock(mtx); + return timeout ? 
        thrd_busy : thrd_success;
+}
+#endif // ifndef EMULATED_THREADS_USE_NATIVE_CV
+
+static struct impl_tss_dtor_entry {
+    tss_t key;
+    tss_dtor_t dtor;
+} impl_tss_dtor_tbl[EMULATED_THREADS_TSS_DTOR_SLOTNUM];
+
+static int impl_tss_dtor_register(tss_t key, tss_dtor_t dtor)
+{
+    int i;
+    for (i = 0; i < EMULATED_THREADS_TSS_DTOR_SLOTNUM; i++) {
+        if (!impl_tss_dtor_tbl[i].dtor)
+            break;
+    }
+    if (i == EMULATED_THREADS_TSS_DTOR_SLOTNUM)
+        return 1;
+    impl_tss_dtor_tbl[i].key = key;
+    impl_tss_dtor_tbl[i].dtor = dtor;
+    return 0;
+}
+
+static void impl_tss_dtor_invoke()
+{
+    int i;
+    for (i = 0; i < EMULATED_THREADS_TSS_DTOR_SLOTNUM; i++) {
+        if (impl_tss_dtor_tbl[i].dtor) {
+            void* val = tss_get(impl_tss_dtor_tbl[i].key);
+            if (val)
+                (impl_tss_dtor_tbl[i].dtor)(val);
+        }
+    }
+}
+
+
+/*--------------- 7.25.2 Initialization functions ---------------*/
+// 7.25.2.1
+static inline void
+call_once(once_flag *flag, void (*func)(void))
+{
+    assert(flag && func);
+#ifdef EMULATED_THREADS_USE_NATIVE_CALL_ONCE
+    {
+    struct impl_call_once_param param;
+    param.func = func;
+    InitOnceExecuteOnce(flag, impl_call_once_callback, (PVOID)&param, NULL);
+    }
+#else
+    if (InterlockedCompareExchange(&flag->status, 1, 0) == 0) {
+        (func)();
+        InterlockedExchange(&flag->status, 2);
+    } else {
+        while (flag->status == 1) {
+            // busy loop!
+            thrd_yield();
+        }
+    }
+#endif
+}
+
+
+/*------------- 7.25.3 Condition variable functions -------------*/
+// 7.25.3.1
+static inline int
+cnd_broadcast(cnd_t *cond)
+{
+    if (!cond) return thrd_error;
+#ifdef EMULATED_THREADS_USE_NATIVE_CV
+    WakeAllConditionVariable(&cond->condvar);
+#else
+    impl_cond_do_signal(cond, 1);
+#endif
+    return thrd_success;
+}
+
+// 7.25.3.2
+static inline void
+cnd_destroy(cnd_t *cond)
+{
+    assert(cond);
+#ifdef EMULATED_THREADS_USE_NATIVE_CV
+    // do nothing
+#else
+    CloseHandle(cond->sem_queue);
+    CloseHandle(cond->sem_gate);
+    DeleteCriticalSection(&cond->monitor);
+#endif
+}
+
+// 7.25.3.3
+static inline int
+cnd_init(cnd_t *cond)
+{
+    if (!cond) return thrd_error;
+#ifdef EMULATED_THREADS_USE_NATIVE_CV
+    InitializeConditionVariable(&cond->condvar);
+#else
+    cond->blocked = 0;
+    cond->gone = 0;
+    cond->to_unblock = 0;
+    cond->sem_queue = CreateSemaphore(NULL, 0, LONG_MAX, NULL);
+    cond->sem_gate = CreateSemaphore(NULL, 1, 1, NULL);
+    InitializeCriticalSection(&cond->monitor);
+#endif
+    return thrd_success;
+}
+
+// 7.25.3.4
+static inline int
+cnd_signal(cnd_t *cond)
+{
+    if (!cond) return thrd_error;
+#ifdef EMULATED_THREADS_USE_NATIVE_CV
+    WakeConditionVariable(&cond->condvar);
+#else
+    impl_cond_do_signal(cond, 0);
+#endif
+    return thrd_success;
+}
+
+// 7.25.3.5
+static inline int
+cnd_timedwait(cnd_t *cond, mtx_t *mtx, const struct timespec *abs_time)
+{
+    if (!cond || !mtx || !abs_time) return thrd_error;
+#ifdef EMULATED_THREADS_USE_NATIVE_CV
+    if (SleepConditionVariableCS(&cond->condvar, mtx, impl_timespec2msec(abs_time)))
+        return thrd_success;
+    return (GetLastError() == ERROR_TIMEOUT) ?
thrd_busy : thrd_error; +#else + return impl_cond_do_wait(cond, mtx, abs_time); +#endif +} + +// 7.25.3.6 +static inline int +cnd_wait(cnd_t *cond, mtx_t *mtx) +{ + if (!cond || !mtx) return thrd_error; +#ifdef EMULATED_THREADS_USE_NATIVE_CV + SleepConditionVariableCS(&cond->condvar, mtx, INFINITE); +#else + impl_cond_do_wait(cond, mtx, NULL); +#endif + return thrd_success; +} + + +/*-------------------- 7.25.4 Mutex functions --------------------*/ +// 7.25.4.1 +static inline void +mtx_destroy(mtx_t *mtx) +{ + assert(mtx); + DeleteCriticalSection(mtx); +} + +// 7.25.4.2 +static inline int +mtx_init(mtx_t *mtx, int type) +{ + if (!mtx) return thrd_error; + if (type != mtx_plain && type != mtx_timed && type != mtx_try + && type != (mtx_plain|mtx_recursive) + && type != (mtx_timed|mtx_recursive) + && type != (mtx_try|mtx_recursive)) + return thrd_error; + InitializeCriticalSection(mtx); + return thrd_success; +} + +// 7.25.4.3 +static inline int +mtx_lock(mtx_t *mtx) +{ + if (!mtx) return thrd_error; + EnterCriticalSection(mtx); + return thrd_success; +} + +// 7.25.4.4 +static inline int +mtx_timedlock(mtx_t *mtx, const struct timespec *ts) +{ + time_t expire, now; + if (!mtx || !ts) return thrd_error; + expire = time(NULL); + expire += ts->tv_sec; + while (mtx_trylock(mtx) != thrd_success) { + now = time(NULL); + if (expire < now) + return thrd_busy; + // busy loop! + thrd_yield(); + } + return thrd_success; +} + +// 7.25.4.5 +static inline int +mtx_trylock(mtx_t *mtx) +{ + if (!mtx) return thrd_error; + return TryEnterCriticalSection(mtx) ? thrd_success : thrd_busy; +} + +// 7.25.4.6 +static inline int +mtx_unlock(mtx_t *mtx) +{ + if (!mtx) return thrd_error; + LeaveCriticalSection(mtx); + return thrd_success; +} + + +/*------------------- 7.25.5 Thread functions -------------------*/ +// 7.25.5.1 +static inline int +thrd_create(thrd_t *thr, thrd_start_t func, void *arg) +{ + struct impl_thrd_param *pack; + uintptr_t handle; + if (!thr) return thrd_error; + pack = (struct impl_thrd_param *)malloc(sizeof(struct impl_thrd_param)); + if (!pack) return thrd_nomem; + pack->func = func; + pack->arg = arg; + handle = _beginthreadex(NULL, 0, impl_thrd_routine, pack, 0, NULL); + if (handle == 0) { + if (errno == EAGAIN || errno == EACCES) + return thrd_nomem; + return thrd_error; + } + *thr = (thrd_t)handle; + return thrd_success; +} + +#if 0 +// 7.25.5.2 +static inline thrd_t +thrd_current(void) +{ + HANDLE hCurrentThread; + BOOL bRet; + + /* GetCurrentThread() returns a pseudo-handle, which we need + * to pass to DuplicateHandle(). Only the resulting handle can be used + * from other threads. + * + * Note that neither handle can be compared to the one by thread_create. + * Only the thread IDs - as returned by GetThreadId() and GetCurrentThreadId() + * can be compared directly. + * + * Other potential solutions would be: + * - define thrd_t as a thread Ids, but this would mean we'd need to OpenThread for many operations + * - use malloc'ed memory for thrd_t. This would imply using TLS for current thread. + * + * Neither is particularly nice. + * + * Life would be much easier if C11 threads had different abstractions for + * threads and thread IDs, just like C++11 threads does... 
+ */ + + bRet = DuplicateHandle(GetCurrentProcess(), // source process (pseudo) handle + GetCurrentThread(), // source (pseudo) handle + GetCurrentProcess(), // target process + &hCurrentThread, // target handle + 0, + FALSE, + DUPLICATE_SAME_ACCESS); + assert(bRet); + if (!bRet) { + hCurrentThread = GetCurrentThread(); + } + return hCurrentThread; +} +#endif + +// 7.25.5.3 +static inline int +thrd_detach(thrd_t thr) +{ + CloseHandle(thr); + return thrd_success; +} + +// 7.25.5.4 +static inline int +thrd_equal(thrd_t thr0, thrd_t thr1) +{ + return GetThreadId(thr0) == GetThreadId(thr1); +} + +// 7.25.5.5 +static inline void +thrd_exit(int res) +{ + impl_tss_dtor_invoke(); + _endthreadex((unsigned)res); +} + +// 7.25.5.6 +static inline int +thrd_join(thrd_t thr, int *res) +{ + DWORD w, code; + w = WaitForSingleObject(thr, INFINITE); + if (w != WAIT_OBJECT_0) + return thrd_error; + if (res) { + if (!GetExitCodeThread(thr, &code)) { + CloseHandle(thr); + return thrd_error; + } + *res = (int)code; + } + CloseHandle(thr); + return thrd_success; +} + +// 7.25.5.7 +static inline void +thrd_sleep(const struct timespec *time_point, struct timespec *remaining) +{ + assert(time_point); + assert(!remaining); /* not implemented */ + Sleep(impl_timespec2msec(time_point)); +} + +// 7.25.5.8 +static inline void +thrd_yield(void) +{ + SwitchToThread(); +} + + +/*----------- 7.25.6 Thread-specific storage functions -----------*/ +// 7.25.6.1 +static inline int +tss_create(tss_t *key, tss_dtor_t dtor) +{ + if (!key) return thrd_error; + *key = TlsAlloc(); + if (dtor) { + if (impl_tss_dtor_register(*key, dtor)) { + TlsFree(*key); + return thrd_error; + } + } + return (*key != 0xFFFFFFFF) ? thrd_success : thrd_error; +} + +// 7.25.6.2 +static inline void +tss_delete(tss_t key) +{ + TlsFree(key); +} + +// 7.25.6.3 +static inline void * +tss_get(tss_t key) +{ + return TlsGetValue(key); +} + +// 7.25.6.4 +static inline int +tss_set(tss_t key, void *val) +{ + return TlsSetValue(key, val) ? thrd_success : thrd_error; +} + + +/*-------------------- 7.25.7 Time functions --------------------*/ +// 7.25.6.1 +#ifndef HAVE_TIMESPEC_GET +static inline int +timespec_get(struct timespec *ts, int base) +{ + if (!ts) return 0; + if (base == TIME_UTC) { + ts->tv_sec = time(NULL); + ts->tv_nsec = 0; + return base; + } + return 0; +} +#endif diff --git a/src/mesa/c11_compat.h b/src/mesa/c11_compat.h new file mode 100644 index 00000000..d35740f4 --- /dev/null +++ b/src/mesa/c11_compat.h @@ -0,0 +1,27 @@ +/* Copyright 2019 Intel Corporation */ +/* SPDX-License-Identifier: MIT */ + +#include "no_extern_c.h" + +#ifndef _C11_COMPAT_H_ +#define _C11_COMPAT_H_ + +#if defined(__cplusplus) + /* This is C++ code, not C */ +#elif (__STDC_VERSION__ >= 201112L) + /* Already C11 */ +#else + + +/* + * C11 static_assert() macro + * assert.h only defines that name for C11 and above + */ +#ifndef static_assert +#define static_assert _Static_assert +#endif + + +#endif /* !C++ && !C11 */ + +#endif /* _C11_COMPAT_H_ */ diff --git a/src/mesa/c99_compat.h b/src/mesa/c99_compat.h new file mode 100644 index 00000000..729b5b79 --- /dev/null +++ b/src/mesa/c99_compat.h @@ -0,0 +1,183 @@ +/************************************************************************** + * + * Copyright 2007-2013 VMware, Inc. + * All Rights Reserved. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "no_extern_c.h"
+
+#ifndef _C99_COMPAT_H_
+#define _C99_COMPAT_H_
+
+
+/*
+ * MSVC hacks.
+ */
+#if defined(_MSC_VER)
+
+# if _MSC_VER < 1900
+# error "Microsoft Visual Studio 2015 or higher required"
+# endif
+
+  /*
+   * Visual Studio will complain if we define the `inline` keyword, but
+   * actually it only supports the keyword on C++.
+   *
+   * To avoid this the _ALLOW_KEYWORD_MACROS must be set.
+   */
+# if !defined(_ALLOW_KEYWORD_MACROS)
+# define _ALLOW_KEYWORD_MACROS
+# endif
+
+  /*
+   * XXX: MSVC has a `__restrict` keyword, but it also has a
+   * `__declspec(restrict)` modifier, so it is impossible to define a
+   * `restrict` macro without interfering with the latter. Furthermore the
+   * MSVC standard library uses __declspec(restrict) under the _CRTRESTRICT
+   * macro. For now resolve this issue by redefining _CRTRESTRICT, but going
+   * forward we should probably stop using restrict, especially
+   * considering that our code does not obey strict aliasing rules anyway.
+ */
+# include <malloc.h>
+# undef _CRTRESTRICT
+# define _CRTRESTRICT
+#endif
+
+
+/*
+ * C99 inline keyword
+ */
+#ifndef inline
+# ifdef __cplusplus
+   /* C++ supports inline keyword */
+# elif defined(__GNUC__)
+# define inline __inline__
+# elif defined(_MSC_VER)
+# define inline __inline
+# elif defined(__ICL)
+# define inline __inline
+# elif defined(__INTEL_COMPILER)
+   /* Intel compiler supports inline keyword */
+# elif defined(__WATCOMC__) && (__WATCOMC__ >= 1100)
+# define inline __inline
+# elif (__STDC_VERSION__ >= 199901L)
+   /* C99 supports inline keyword */
+# else
+# define inline
+# endif
+#endif
+
+
+/*
+ * C99 restrict keyword
+ *
+ * See also:
+ * - http://cellperformance.beyond3d.com/articles/2006/05/demystifying-the-restrict-keyword.html
+ */
+#ifndef restrict
+# if (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus)
+   /* C99 */
+# elif defined(__GNUC__)
+# define restrict __restrict__
+# elif defined(_MSC_VER)
+# define restrict __restrict
+# else
+# define restrict /* */
+# endif
+#endif
+
+
+/*
+ * C99 __func__ macro
+ */
+#ifndef __func__
+# if (__STDC_VERSION__ >= 199901L)
+   /* C99 */
+# elif defined(__GNUC__)
+# define __func__ __FUNCTION__
+# elif defined(_MSC_VER)
+# define __func__ __FUNCTION__
+# else
+# define __func__ ""
+# endif
+#endif
+
+
+/* Simple test case for debugging */
+#if 0
+static inline const char *
+test_c99_compat_h(const void * restrict a,
+                  const void * restrict b)
+{
+   return __func__;
+}
+#endif
+
+
+/* Fallback definitions, for scons which doesn't auto-detect these things. */
+#ifdef HAVE_SCONS
+
+# ifndef _WIN32
+# define HAVE_PTHREAD
+# define HAVE_POSIX_MEMALIGN
+# endif
+
+# ifdef __GNUC__
+# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 2)
+# error "GCC version 4.2 or higher required"
+# endif
+
+   /* https://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Other-Builtins.html */
+# define HAVE___BUILTIN_CLZ 1
+# define HAVE___BUILTIN_CLZLL 1
+# define HAVE___BUILTIN_CTZ 1
+# define HAVE___BUILTIN_EXPECT 1
+# define HAVE___BUILTIN_FFS 1
+# define HAVE___BUILTIN_FFSLL 1
+# define HAVE___BUILTIN_POPCOUNT 1
+# define HAVE___BUILTIN_POPCOUNTLL 1
+   /* https://gcc.gnu.org/onlinedocs/gcc-4.2.4/gcc/Function-Attributes.html */
+# define HAVE_FUNC_ATTRIBUTE_FLATTEN 1
+# define HAVE_FUNC_ATTRIBUTE_UNUSED 1
+# define HAVE_FUNC_ATTRIBUTE_FORMAT 1
+# define HAVE_FUNC_ATTRIBUTE_PACKED 1
+# define HAVE_FUNC_ATTRIBUTE_ALIAS 1
+# define HAVE_FUNC_ATTRIBUTE_NORETURN 1
+
+# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+   /* https://gcc.gnu.org/onlinedocs/gcc-4.3.6/gcc/Other-Builtins.html */
+# define HAVE___BUILTIN_BSWAP32 1
+# define HAVE___BUILTIN_BSWAP64 1
+# endif
+
+# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+# define HAVE___BUILTIN_UNREACHABLE 1
+# endif
+
+# endif /* __GNUC__ */
+
+#endif /* HAVE_SCONS */
+
+
+#endif /* _C99_COMPAT_H_ */
diff --git a/src/mesa/fast_urem_by_const.h b/src/mesa/fast_urem_by_const.h
new file mode 100644
index 00000000..beb253d2
--- /dev/null
+++ b/src/mesa/fast_urem_by_const.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright © 2010 Valve Software
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdint.h>
+
+/*
+ * Code for fast 32-bit unsigned remainder, based off of "Faster Remainder by
+ * Direct Computation: Applications to Compilers and Software Libraries,"
+ * available at https://arxiv.org/pdf/1902.01961.pdf.
+ *
+ * util_fast_urem32(n, d, REMAINDER_MAGIC(d)) returns the same thing as
+ * n % d for any unsigned n and d, however it compiles down to only a few
+ * multiplications, so it should be faster than plain uint32_t modulo if the
+ * same divisor is used many times.
+ */
+
+#define REMAINDER_MAGIC(divisor) \
+   ((uint64_t) ~0ull / (divisor) + 1)
+
+/*
+ * Get bits 64-96 of a 32x64-bit multiply. If __int128_t is available, we use
+ * it, which usually compiles down to one instruction on 64-bit architectures.
+ * Otherwise on 32-bit architectures we usually get four instructions (one
+ * 32x32->64 multiply, one 32x32->32 multiply, and one 64-bit add).
+ */
+
+static inline uint32_t
+_mul32by64_hi(uint32_t a, uint64_t b)
+{
+#ifdef HAVE_UINT128
+   return ((__uint128_t) b * a) >> 64;
+#else
+   /*
+    * Let b = b0 + 2^32 * b1. Then a * b = a * b0 + 2^32 * a * b1. We would
+    * have to do a 96-bit addition to get the full result, except that only
+    * one term has non-zero lower 32 bits, which means that to get the high 32
+    * bits, we only have to add the high 64 bits of each term. Unfortunately,
+    * we have to do the 64-bit addition in case the low 32 bits overflow.
+    */
+   uint32_t b0 = (uint32_t) b;
+   uint32_t b1 = b >> 32;
+   return ((((uint64_t) a * b0) >> 32) + (uint64_t) a * b1) >> 32;
+#endif
+}
+
+static inline uint32_t
+util_fast_urem32(uint32_t n, uint32_t d, uint64_t magic)
+{
+   uint64_t lowbits = magic * n;
+   uint32_t result = _mul32by64_hi(d, lowbits);
+   assert(result == n % d);
+   return result;
+}
+
diff --git a/src/mesa/main/hash.c b/src/mesa/main/hash.c
new file mode 100644
index 00000000..1b1d9546
--- /dev/null
+++ b/src/mesa/main/hash.c
@@ -0,0 +1,425 @@
+/**
+ * \file hash.c
+ * Generic hash table.
+ *
+ * Used for display lists, texture objects, vertex/fragment programs,
+ * buffer objects, etc. The hash functions are thread-safe.
+ *
+ * \note key=0 is illegal.
+ *
+ * \author Brian Paul
+ */
+
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+//#include "errors.h"
+#include <stdlib.h>
+#include "hash.h"
+#include "../util/hash_table.h"
+
+
+/**
+ * Create a new hash table.
+ *
+ * \return pointer to a new, empty hash table.
+ */
+struct _mesa_HashTable *
+_mesa_NewHashTable(void)
+{
+   struct _mesa_HashTable *table = CALLOC_STRUCT(_mesa_HashTable);
+
+   if (table) {
+      table->ht = _mesa_hash_table_create(NULL, uint_key_hash,
+                                          uint_key_compare);
+      if (table->ht == NULL) {
+         free(table);
+         //_mesa_error_no_memory(__func__);
+         return NULL;
+      }
+
+      _mesa_hash_table_set_deleted_key(table->ht, uint_key(DELETED_KEY_VALUE));
+      /*
+       * Needs to be recursive, since the callback in _mesa_HashWalk()
+       * is allowed to call _mesa_HashRemove().
+       */
+      mtx_init(&table->Mutex, mtx_recursive);
+   }
+   else {
+      //_mesa_error_no_memory(__func__);
+   }
+
+   return table;
+}
+
+
+
+/**
+ * Delete a hash table.
+ * Frees each entry on the hash table and then the hash table structure itself.
+ * Note that the caller should have already traversed the table and deleted
+ * the objects in the table (i.e. We don't free the entries' data pointer).
+ *
+ * \param table the hash table to delete.
+ */
+void
+_mesa_DeleteHashTable(struct _mesa_HashTable *table)
+{
+   assert(table);
+
+   if (_mesa_hash_table_next_entry(table->ht, NULL) != NULL) {
+      // _mesa_problem(NULL, "In _mesa_DeleteHashTable, found non-freed data");
+   }
+
+   _mesa_hash_table_destroy(table->ht, NULL);
+
+   mtx_destroy(&table->Mutex);
+   free(table);
+}
+
+
+
+/**
+ * Lookup an entry in the hash table, without locking.
+ * \sa _mesa_HashLookup
+ */
+static inline void *
+_mesa_HashLookup_unlocked(struct _mesa_HashTable *table, GLuint key)
+{
+   const struct hash_entry *entry;
+
+   assert(table);
+   assert(key);
+
+   if (key == DELETED_KEY_VALUE)
+      return table->deleted_key_data;
+
+   entry = _mesa_hash_table_search_pre_hashed(table->ht,
+                                              uint_hash(key),
+                                              uint_key(key));
+   if (!entry)
+      return NULL;
+
+   return entry->data;
+}
+
+
+/**
+ * Lookup an entry in the hash table.
+ *
+ * \param table the hash table.
+ * \param key the key.
+ *
+ * \return pointer to user's data or NULL if key not in table
+ */
+void *
+_mesa_HashLookup(struct _mesa_HashTable *table, GLuint key)
+{
+   void *res;
+   _mesa_HashLockMutex(table);
+   res = _mesa_HashLookup_unlocked(table, key);
+   _mesa_HashUnlockMutex(table);
+   return res;
+}
+
+
+/**
+ * Lookup an entry in the hash table without locking the mutex.
+ *
+ * The hash table mutex must be locked manually by calling
+ * _mesa_HashLockMutex() before calling this function.
+ *
+ * \param table the hash table.
+ * \param key the key.
+ * + * \return pointer to user's data or NULL if key not in table + */ +void * +_mesa_HashLookupLocked(struct _mesa_HashTable *table, GLuint key) +{ + return _mesa_HashLookup_unlocked(table, key); +} + + +static inline void +_mesa_HashInsert_unlocked(struct _mesa_HashTable *table, GLuint key, void *data) +{ + uint32_t hash = uint_hash(key); + struct hash_entry *entry; + + assert(table); + assert(key); + + if (key > table->MaxKey) + table->MaxKey = key; + + if (key == DELETED_KEY_VALUE) { + table->deleted_key_data = data; + } else { + entry = _mesa_hash_table_search_pre_hashed(table->ht, hash, uint_key(key)); + if (entry) { + entry->data = data; + } else { + _mesa_hash_table_insert_pre_hashed(table->ht, hash, uint_key(key), data); + } + } +} + + +/** + * Insert a key/pointer pair into the hash table without locking the mutex. + * If an entry with this key already exists we'll replace the existing entry. + * + * The hash table mutex must be locked manually by calling + * _mesa_HashLockMutex() before calling this function. + * + * \param table the hash table. + * \param key the key (not zero). + * \param data pointer to user data. + */ +void +_mesa_HashInsertLocked(struct _mesa_HashTable *table, GLuint key, void *data) +{ + _mesa_HashInsert_unlocked(table, key, data); +} + + +/** + * Insert a key/pointer pair into the hash table. + * If an entry with this key already exists we'll replace the existing entry. + * + * \param table the hash table. + * \param key the key (not zero). + * \param data pointer to user data. + */ +void +_mesa_HashInsert(struct _mesa_HashTable *table, GLuint key, void *data) +{ + _mesa_HashLockMutex(table); + _mesa_HashInsert_unlocked(table, key, data); + _mesa_HashUnlockMutex(table); +} + + +/** + * Remove an entry from the hash table. + * + * \param table the hash table. + * \param key key of entry to remove. + * + * While holding the hash table's lock, searches the entry with the matching + * key and unlinks it. + */ +static inline void +_mesa_HashRemove_unlocked(struct _mesa_HashTable *table, GLuint key) +{ + struct hash_entry *entry; + + assert(table); + assert(key); + + /* assert if _mesa_HashRemove illegally called from _mesa_HashDeleteAll + * callback function. Have to check this outside of mutex lock. + */ + assert(!table->InDeleteAll); + + if (key == DELETED_KEY_VALUE) { + table->deleted_key_data = NULL; + } else { + entry = _mesa_hash_table_search_pre_hashed(table->ht, + uint_hash(key), + uint_key(key)); + _mesa_hash_table_remove(table->ht, entry); + } +} + + +void +_mesa_HashRemoveLocked(struct _mesa_HashTable *table, GLuint key) +{ + _mesa_HashRemove_unlocked(table, key); +} + +void +_mesa_HashRemove(struct _mesa_HashTable *table, GLuint key) +{ + _mesa_HashLockMutex(table); + _mesa_HashRemove_unlocked(table, key); + _mesa_HashUnlockMutex(table); +} + +/** + * Delete all entries in a hash table, but don't delete the table itself. + * Invoke the given callback function for each table entry. 
+ * + * \param table the hash table to delete + * \param callback the callback function + * \param userData arbitrary pointer to pass along to the callback + * (this is typically a struct gl_context pointer) + */ +void +_mesa_HashDeleteAll(struct _mesa_HashTable *table, + void (*callback)(GLuint key, void *data, void *userData), + void *userData) +{ + assert(callback); + _mesa_HashLockMutex(table); + table->InDeleteAll = GL_TRUE; + hash_table_foreach(table->ht, entry) { + callback((uintptr_t)entry->key, entry->data, userData); + _mesa_hash_table_remove(table->ht, entry); + } + if (table->deleted_key_data) { + callback(DELETED_KEY_VALUE, table->deleted_key_data, userData); + table->deleted_key_data = NULL; + } + table->InDeleteAll = GL_FALSE; + _mesa_HashUnlockMutex(table); +} + + +/** + * Walk over all entries in a hash table, calling callback function for each. + * \param table the hash table to walk + * \param callback the callback function + * \param userData arbitrary pointer to pass along to the callback + * (this is typically a struct gl_context pointer) + */ +static void +hash_walk_unlocked(const struct _mesa_HashTable *table, + void (*callback)(GLuint key, void *data, void *userData), + void *userData) +{ + assert(table); + assert(callback); + + hash_table_foreach(table->ht, entry) { + callback((uintptr_t)entry->key, entry->data, userData); + } + if (table->deleted_key_data) + callback(DELETED_KEY_VALUE, table->deleted_key_data, userData); +} + + +void +_mesa_HashWalk(const struct _mesa_HashTable *table, + void (*callback)(GLuint key, void *data, void *userData), + void *userData) +{ + /* cast-away const */ + struct _mesa_HashTable *table2 = (struct _mesa_HashTable *) table; + + _mesa_HashLockMutex(table2); + hash_walk_unlocked(table, callback, userData); + _mesa_HashUnlockMutex(table2); +} + +void +_mesa_HashWalkLocked(const struct _mesa_HashTable *table, + void (*callback)(GLuint key, void *data, void *userData), + void *userData) +{ + hash_walk_unlocked(table, callback, userData); +} + +static void +debug_print_entry(GLuint key, void *data, void *userData) +{ + //_mesa_debug(NULL, "%u %p\n", key, data); +} + +/** + * Dump contents of hash table for debugging. + * + * \param table the hash table. + */ +void +_mesa_HashPrint(const struct _mesa_HashTable *table) +{ + if (table->deleted_key_data) + debug_print_entry(DELETED_KEY_VALUE, table->deleted_key_data, NULL); + _mesa_HashWalk(table, debug_print_entry, NULL); +} + + +/** + * Find a block of adjacent unused hash keys. + * + * \param table the hash table. + * \param numKeys number of keys needed. + * + * \return Starting key of free block or 0 if failure. + * + * If there are enough free keys between the maximum key existing in the table + * (_mesa_HashTable::MaxKey) and the maximum key possible, then simply return + * the adjacent key. Otherwise do a full search for a free key block in the + * allowable key range. 
+ */
+GLuint
+_mesa_HashFindFreeKeyBlock(struct _mesa_HashTable *table, GLuint numKeys)
+{
+   const GLuint maxKey = ~((GLuint) 0) - 1;
+   if (maxKey - numKeys > table->MaxKey) {
+      /* the quick solution */
+      return table->MaxKey + 1;
+   }
+   else {
+      /* the slow solution */
+      GLuint freeCount = 0;
+      GLuint freeStart = 1;
+      GLuint key;
+      for (key = 1; key != maxKey; key++) {
+         if (_mesa_HashLookup_unlocked(table, key)) {
+            /* darn, this key is already in use */
+            freeCount = 0;
+            freeStart = key+1;
+         }
+         else {
+            /* this key not in use, check if we've found enough */
+            freeCount++;
+            if (freeCount == numKeys) {
+               return freeStart;
+            }
+         }
+      }
+      /* cannot allocate a block of numKeys consecutive keys */
+      return 0;
+   }
+}
+
+
+/**
+ * Return the number of entries in the hash table.
+ */
+GLuint
+_mesa_HashNumEntries(const struct _mesa_HashTable *table)
+{
+   GLuint count = 0;
+
+   if (table->deleted_key_data)
+      count++;
+
+   count += _mesa_hash_table_num_entries(table->ht);
+
+   return count;
+}
diff --git a/src/mesa/main/hash.h b/src/mesa/main/hash.h
new file mode 100644
index 00000000..9c5e9ffe
--- /dev/null
+++ b/src/mesa/main/hash.h
@@ -0,0 +1,191 @@
+/**
+ * \file hash.h
+ * Generic hash table.
+ */
+
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 1999-2006 Brian Paul All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+#ifndef HASH_H
+#define HASH_H
+
+
+#include <stdbool.h>
+#include <stdint.h>
+//#include "imports.h"
+#include "../c11/threads.h"
+
+/**********************************************************************/
+/** Memory macros */
+/*@{*/
+
+/** Allocate a structure of type \p T */
+#define MALLOC_STRUCT(T)   (struct T *) malloc(sizeof(struct T))
+/** Allocate and zero a structure of type \p T */
+#define CALLOC_STRUCT(T)   (struct T *) calloc(1, sizeof(struct T))
+
+/*@}*/
+
+/**
+ * Magic GLuint object name that gets stored outside of the struct hash_table.
+ *
+ * The hash table needs a particular pointer to be the marker for a key that
+ * was deleted from the table, along with NULL for the "never allocated in the
+ * table" marker. Legacy GL allows any GLuint to be used as a GL object name,
+ * and we use a 1:1 mapping from GLuints to key pointers, so we need to be
+ * able to track a GLuint that happens to match the deleted key outside of
+ * struct hash_table. We tell the hash table to use "1" as the deleted key
+ * value, so that we test the deleted-key-in-the-table path as best we can.
+ */ +#define DELETED_KEY_VALUE 1 + +/** @{ + * Mapping from our use of GLuint as both the key and the hash value to the + * hash_table.h API + * + * There exist many integer hash functions, designed to avoid collisions when + * the integers are spread across key space with some patterns. In GL, the + * pattern (in the case of glGen*()ed object IDs) is that the keys are unique + * contiguous integers starting from 1. Because of that, we just use the key + * as the hash value, to minimize the cost of the hash function. If objects + * are never deleted, we will never see a collision in the table, because the + * table resizes itself when it approaches full, and thus key % table_size == + * key. + * + * The case where we could have collisions for genned objects would be + * something like: glGenBuffers(&a, 100); glDeleteBuffers(&a + 50, 50); + * glGenBuffers(&b, 100), because objects 1-50 and 101-200 are allocated at + * the end of that sequence, instead of 1-150. So far it doesn't appear to be + * a problem. + */ +static inline bool +uint_key_compare(const void *a, const void *b) +{ + return a == b; +} + +static inline uint32_t +uint_hash(GLuint id) +{ + return id; +} + +static inline uint32_t +uint_key_hash(const void *key) +{ + return uint_hash((uintptr_t)key); +} + +static inline void * +uint_key(GLuint id) +{ + return (void *)(uintptr_t) id; +} +/** @} */ + +/** + * The hash table data structure. + */ +struct _mesa_HashTable { + struct hash_table *ht; + GLuint MaxKey; /**< highest key inserted so far */ + mtx_t Mutex; /**< mutual exclusion lock */ + GLboolean InDeleteAll; /**< Debug check */ + /** Value that would be in the table for DELETED_KEY_VALUE. */ + void *deleted_key_data; +}; + +extern struct _mesa_HashTable *_mesa_NewHashTable(void); + +extern void _mesa_DeleteHashTable(struct _mesa_HashTable *table); + +extern void *_mesa_HashLookup(struct _mesa_HashTable *table, GLuint key); + +extern void _mesa_HashInsert(struct _mesa_HashTable *table, GLuint key, void *data); + +extern void _mesa_HashRemove(struct _mesa_HashTable *table, GLuint key); + +/** + * Lock the hash table mutex. + * + * This function should be used when multiple objects need + * to be looked up in the hash table, to avoid having to lock + * and unlock the mutex each time. + * + * \param table the hash table. + */ +static inline void +_mesa_HashLockMutex(struct _mesa_HashTable *table) +{ + assert(table); + mtx_lock(&table->Mutex); +} + + +/** + * Unlock the hash table mutex. + * + * \param table the hash table. 
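+ *
+ * A typical pairing with _mesa_HashLockMutex() looks like this (an
+ * illustrative sketch only; obj1/obj2/name1/name2 are placeholders):
+ *
+ *   _mesa_HashLockMutex(table);
+ *   obj1 = _mesa_HashLookupLocked(table, name1);
+ *   obj2 = _mesa_HashLookupLocked(table, name2);
+ *   _mesa_HashUnlockMutex(table);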
+ */
+static inline void
+_mesa_HashUnlockMutex(struct _mesa_HashTable *table)
+{
+   assert(table);
+   mtx_unlock(&table->Mutex);
+}
+
+extern void *_mesa_HashLookupLocked(struct _mesa_HashTable *table, GLuint key);
+
+extern void _mesa_HashInsertLocked(struct _mesa_HashTable *table,
+                                   GLuint key, void *data);
+
+extern void _mesa_HashRemoveLocked(struct _mesa_HashTable *table, GLuint key);
+
+extern void
+_mesa_HashDeleteAll(struct _mesa_HashTable *table,
+                    void (*callback)(GLuint key, void *data, void *userData),
+                    void *userData);
+
+extern void
+_mesa_HashWalk(const struct _mesa_HashTable *table,
+               void (*callback)(GLuint key, void *data, void *userData),
+               void *userData);
+
+extern void
+_mesa_HashWalkLocked(const struct _mesa_HashTable *table,
+                     void (*callback)(GLuint key, void *data, void *userData),
+                     void *userData);
+
+extern void _mesa_HashPrint(const struct _mesa_HashTable *table);
+
+extern GLuint _mesa_HashFindFreeKeyBlock(struct _mesa_HashTable *table, GLuint numKeys);
+
+extern GLuint
+_mesa_HashNumEntries(const struct _mesa_HashTable *table);
+
+extern void _mesa_test_hash_functions(void);
+
+
+#endif
diff --git a/src/mesa/no_extern_c.h b/src/mesa/no_extern_c.h
new file mode 100644
index 00000000..f79602c0
--- /dev/null
+++ b/src/mesa/no_extern_c.h
@@ -0,0 +1,48 @@
+/**************************************************************************
+ *
+ * Copyright 2014 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+/*
+ * Including system headers inside `extern "C" { ... }` is not safe, as system
+ * headers may have C++ code in them, and C++ code inside extern "C"
+ * leads to syntactically incorrect code.
+ *
+ * This is because putting code inside extern "C" won't make the __cplusplus
+ * define go away; that is, the system header being included thinks it is
+ * free to use C++ as it sees fit.
+ *
+ * Including non-system headers inside extern "C" is not safe either, because
+ * non-system headers end up including system headers, hence fall into the
+ * above case too.
+ *
+ * Conclusion: includes inside extern "C" are simply not portable.
+ *
+ *
+ * This header helps surface these issues.
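+ *
+ * For example (an illustrative sketch; "some_header.h" stands for any
+ * hypothetical header that transitively includes this file):
+ *
+ *   extern "C" {
+ *   #include "some_header.h"
+ *   }
+ *
+ * fails to compile, because the template declaration below is ill-formed
+ * inside an extern "C" block, so the error points straight at the
+ * offending include.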
+ */
+
+#ifdef __cplusplus
+template<class T> class _IncludeInsideExternCNotPortable;
+#endif
diff --git a/src/mesa/util/detect_os.h b/src/mesa/util/detect_os.h
new file mode 100644
index 00000000..6506948e
--- /dev/null
+++ b/src/mesa/util/detect_os.h
@@ -0,0 +1,131 @@
+/* SPDX-License-Identifier: MIT */
+/* Copyright 2008 VMware, Inc. */
+
+/**
+ * Auto-detect the operating system family.
+ *
+ * See also:
+ * - http://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html
+ * - echo | gcc -dM -E - | sort
+ * - http://msdn.microsoft.com/en-us/library/b0084kay.aspx
+ *
+ * @author José Fonseca
+ */
+
+#ifndef DETECT_OS_H
+#define DETECT_OS_H
+
+#if defined(__linux__)
+#define DETECT_OS_LINUX 1
+#define DETECT_OS_UNIX 1
+#endif
+
+/*
+ * Android defines __linux__, so DETECT_OS_LINUX and DETECT_OS_UNIX will
+ * also be defined.
+ */
+#if defined(ANDROID)
+#define DETECT_OS_ANDROID 1
+#endif
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+#define DETECT_OS_FREEBSD 1
+#define DETECT_OS_BSD 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__OpenBSD__)
+#define DETECT_OS_OPENBSD 1
+#define DETECT_OS_BSD 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__NetBSD__)
+#define DETECT_OS_NETBSD 1
+#define DETECT_OS_BSD 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__DragonFly__)
+#define DETECT_OS_DRAGONFLY 1
+#define DETECT_OS_BSD 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__GNU__)
+#define DETECT_OS_HURD 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__sun)
+#define DETECT_OS_SOLARIS 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__APPLE__)
+#define DETECT_OS_APPLE 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(_WIN32) || defined(WIN32)
+#define DETECT_OS_WINDOWS 1
+#endif
+
+#if defined(__HAIKU__)
+#define DETECT_OS_HAIKU 1
+#define DETECT_OS_UNIX 1
+#endif
+
+#if defined(__CYGWIN__)
+#define DETECT_OS_CYGWIN 1
+#define DETECT_OS_UNIX 1
+#endif
+
+
+/*
+ * Make sure DETECT_OS_* are always defined, so that they can be used with #if
+ */
+#ifndef DETECT_OS_ANDROID
+#define DETECT_OS_ANDROID 0
+#endif
+#ifndef DETECT_OS_APPLE
+#define DETECT_OS_APPLE 0
+#endif
+#ifndef DETECT_OS_BSD
+#define DETECT_OS_BSD 0
+#endif
+#ifndef DETECT_OS_CYGWIN
+#define DETECT_OS_CYGWIN 0
+#endif
+#ifndef DETECT_OS_DRAGONFLY
+#define DETECT_OS_DRAGONFLY 0
+#endif
+#ifndef DETECT_OS_FREEBSD
+#define DETECT_OS_FREEBSD 0
+#endif
+#ifndef DETECT_OS_HAIKU
+#define DETECT_OS_HAIKU 0
+#endif
+#ifndef DETECT_OS_HURD
+#define DETECT_OS_HURD 0
+#endif
+#ifndef DETECT_OS_LINUX
+#define DETECT_OS_LINUX 0
+#endif
+#ifndef DETECT_OS_NETBSD
+#define DETECT_OS_NETBSD 0
+#endif
+#ifndef DETECT_OS_OPENBSD
+#define DETECT_OS_OPENBSD 0
+#endif
+#ifndef DETECT_OS_SOLARIS
+#define DETECT_OS_SOLARIS 0
+#endif
+#ifndef DETECT_OS_UNIX
+#define DETECT_OS_UNIX 0
+#endif
+#ifndef DETECT_OS_WINDOWS
+#define DETECT_OS_WINDOWS 0
+#endif
+
+#endif /* DETECT_OS_H */
diff --git a/src/mesa/util/futex.h b/src/mesa/util/futex.h
new file mode 100644
index 00000000..cf8dd020
--- /dev/null
+++ b/src/mesa/util/futex.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright © 2015 Intel
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef UTIL_FUTEX_H
+#define UTIL_FUTEX_H
+
+#if defined(HAVE_LINUX_FUTEX_H)
+
+#include <limits.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+
+static inline long sys_futex(void *addr1, int op, int val1, const struct timespec *timeout, void *addr2, int val3)
+{
+   return syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
+}
+
+static inline int futex_wake(uint32_t *addr, int count)
+{
+   return sys_futex(addr, FUTEX_WAKE, count, NULL, NULL, 0);
+}
+
+static inline int futex_wait(uint32_t *addr, int32_t value, const struct timespec *timeout)
+{
+   /* FUTEX_WAIT_BITSET with FUTEX_BITSET_MATCH_ANY is equivalent to
+    * FUTEX_WAIT, except that it treats the timeout as absolute. */
+   return sys_futex(addr, FUTEX_WAIT_BITSET, value, timeout, NULL,
+                    FUTEX_BITSET_MATCH_ANY);
+}
+
+#elif defined(__FreeBSD__)
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/umtx.h>
+#include <sys/time.h>
+
+static inline int futex_wake(uint32_t *addr, int count)
+{
+   assert(count == (int)(uint32_t)count); /* Check that bits weren't discarded */
+   return _umtx_op(addr, UMTX_OP_WAKE, (uint32_t)count, NULL, NULL) == -1 ? errno : 0;
+}
+
+static inline int futex_wait(uint32_t *addr, int32_t value, struct timespec *timeout)
+{
+   void *uaddr = NULL, *uaddr2 = NULL;
+   struct _umtx_time tmo = {
+      ._flags = UMTX_ABSTIME,
+      ._clockid = CLOCK_MONOTONIC
+   };
+
+   assert(value == (int)(uint32_t)value); /* Check that bits weren't discarded */
+
+   if (timeout != NULL) {
+      tmo._timeout = *timeout;
+      uaddr = (void *)(uintptr_t)sizeof(tmo);
+      uaddr2 = (void *)&tmo;
+   }
+
+   return _umtx_op(addr, UMTX_OP_WAIT_UINT, (uint32_t)value, uaddr, uaddr2) == -1 ? errno : 0;
+}
+
+#elif defined(__OpenBSD__)
+
+#include <sys/time.h>
+#include <sys/futex.h>
+
+static inline int futex_wake(uint32_t *addr, int count)
+{
+   return futex(addr, FUTEX_WAKE, count, NULL, NULL);
+}
+
+static inline int futex_wait(uint32_t *addr, int32_t value, const struct timespec *timeout)
+{
+   struct timespec tsrel, tsnow;
+   clock_gettime(CLOCK_MONOTONIC, &tsnow);
+   /* convert the absolute timeout to a relative one (was subtracting the
+    * uninitialized tsrel from itself, leaving tsnow unused) */
+   timespecsub(timeout, &tsnow, &tsrel);
+   return futex(addr, FUTEX_WAIT, value, &tsrel, NULL);
+}
+
+#endif
+
+#endif /* UTIL_FUTEX_H */
diff --git a/src/mesa/util/hash_table.c b/src/mesa/util/hash_table.c
new file mode 100644
index 00000000..aee3f5ee
--- /dev/null
+++ b/src/mesa/util/hash_table.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright © 2009,2012 Intel Corporation
+ * Copyright © 1988-2004 Keith Packard and Bart Massey.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the names of the authors
+ * or their institutions shall not be used in advertising or
+ * otherwise to promote the sale, use or other dealings in this
+ * Software without prior written authorization from the
+ * authors.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *    Keith Packard <keithp@keithp.com>
+ */
+
+/**
+ * Implements an open-addressing, linear-reprobing hash table.
+ *
+ * For more information, see:
+ *
+ * http://cgit.freedesktop.org/~anholt/hash_table/tree/README
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include "hash_table.h"
+#include "ralloc.h"
+#include "macros.h"
+#include "../main/hash.h"
+#include "../fast_urem_by_const.h"
+
+static const uint32_t deleted_key_value;
+
+/**
+ * From Knuth -- a good choice for hash/rehash values is p, p-2 where
+ * p and p-2 are both prime.
These tables are sized to have an extra 10% + * free to avoid exponential performance degradation as the hash table fills + */ +static const struct { + uint32_t max_entries, size, rehash; + uint64_t size_magic, rehash_magic; +} hash_sizes[] = { +#define ENTRY(max_entries, size, rehash) \ + { max_entries, size, rehash, \ + REMAINDER_MAGIC(size), REMAINDER_MAGIC(rehash) } + + ENTRY(2, 5, 3 ), + ENTRY(4, 7, 5 ), + ENTRY(8, 13, 11 ), + ENTRY(16, 19, 17 ), + ENTRY(32, 43, 41 ), + ENTRY(64, 73, 71 ), + ENTRY(128, 151, 149 ), + ENTRY(256, 283, 281 ), + ENTRY(512, 571, 569 ), + ENTRY(1024, 1153, 1151 ), + ENTRY(2048, 2269, 2267 ), + ENTRY(4096, 4519, 4517 ), + ENTRY(8192, 9013, 9011 ), + ENTRY(16384, 18043, 18041 ), + ENTRY(32768, 36109, 36107 ), + ENTRY(65536, 72091, 72089 ), + ENTRY(131072, 144409, 144407 ), + ENTRY(262144, 288361, 288359 ), + ENTRY(524288, 576883, 576881 ), + ENTRY(1048576, 1153459, 1153457 ), + ENTRY(2097152, 2307163, 2307161 ), + ENTRY(4194304, 4613893, 4613891 ), + ENTRY(8388608, 9227641, 9227639 ), + ENTRY(16777216, 18455029, 18455027 ), + ENTRY(33554432, 36911011, 36911009 ), + ENTRY(67108864, 73819861, 73819859 ), + ENTRY(134217728, 147639589, 147639587 ), + ENTRY(268435456, 295279081, 295279079 ), + ENTRY(536870912, 590559793, 590559791 ), + ENTRY(1073741824, 1181116273, 1181116271 ), + ENTRY(2147483648ul, 2362232233ul, 2362232231ul ) +}; + +static inline bool +key_pointer_is_reserved(const struct hash_table *ht, const void *key) +{ + return key == NULL || key == ht->deleted_key; +} + +static int +entry_is_free(const struct hash_entry *entry) +{ + return entry->key == NULL; +} + +static int +entry_is_deleted(const struct hash_table *ht, struct hash_entry *entry) +{ + return entry->key == ht->deleted_key; +} + +static int +entry_is_present(const struct hash_table *ht, struct hash_entry *entry) +{ + return entry->key != NULL && entry->key != ht->deleted_key; +} + +bool +_mesa_hash_table_init(struct hash_table *ht, + void *mem_ctx, + uint32_t (*key_hash_function)(const void *key), + bool (*key_equals_function)(const void *a, + const void *b)) +{ + ht->size_index = 0; + ht->size = hash_sizes[ht->size_index].size; + ht->rehash = hash_sizes[ht->size_index].rehash; + ht->size_magic = hash_sizes[ht->size_index].size_magic; + ht->rehash_magic = hash_sizes[ht->size_index].rehash_magic; + ht->max_entries = hash_sizes[ht->size_index].max_entries; + ht->key_hash_function = key_hash_function; + ht->key_equals_function = key_equals_function; + ht->table = rzalloc_array(mem_ctx, struct hash_entry, ht->size); + ht->entries = 0; + ht->deleted_entries = 0; + ht->deleted_key = &deleted_key_value; + + return ht->table != NULL; +} + +struct hash_table * +_mesa_hash_table_create(void *mem_ctx, + uint32_t (*key_hash_function)(const void *key), + bool (*key_equals_function)(const void *a, + const void *b)) +{ + struct hash_table *ht; + + /* mem_ctx is used to allocate the hash table, but the hash table is used + * to allocate all of the suballocations. 
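+    *
+    * A typical creation call therefore looks like this (an illustrative
+    * sketch only, using the string helpers declared in hash_table.h):
+    *
+    *    struct hash_table *ht =
+    *       _mesa_hash_table_create(NULL, _mesa_key_hash_string,
+    *                               _mesa_key_string_equal);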
+ */ + ht = ralloc(mem_ctx, struct hash_table); + if (ht == NULL) + return NULL; + + if (!_mesa_hash_table_init(ht, ht, key_hash_function, key_equals_function)) { + ralloc_free(ht); + return NULL; + } + + return ht; +} + +struct hash_table * +_mesa_hash_table_clone(struct hash_table *src, void *dst_mem_ctx) +{ + struct hash_table *ht; + + ht = ralloc(dst_mem_ctx, struct hash_table); + if (ht == NULL) + return NULL; + + memcpy(ht, src, sizeof(struct hash_table)); + + ht->table = ralloc_array(ht, struct hash_entry, ht->size); + if (ht->table == NULL) { + ralloc_free(ht); + return NULL; + } + + memcpy(ht->table, src->table, ht->size * sizeof(struct hash_entry)); + + return ht; +} + +/** + * Frees the given hash table. + * + * If delete_function is passed, it gets called on each entry present before + * freeing. + */ +void +_mesa_hash_table_destroy(struct hash_table *ht, + void (*delete_function)(struct hash_entry *entry)) +{ + if (!ht) + return; + + if (delete_function) { + hash_table_foreach(ht, entry) { + delete_function(entry); + } + } + ralloc_free(ht); +} + +/** + * Deletes all entries of the given hash table without deleting the table + * itself or changing its structure. + * + * If delete_function is passed, it gets called on each entry present. + */ +void +_mesa_hash_table_clear(struct hash_table *ht, + void (*delete_function)(struct hash_entry *entry)) +{ + struct hash_entry *entry; + + for (entry = ht->table; entry != ht->table + ht->size; entry++) { + if (entry->key == NULL) + continue; + + if (delete_function != NULL && entry->key != ht->deleted_key) + delete_function(entry); + + entry->key = NULL; + } + + ht->entries = 0; + ht->deleted_entries = 0; +} + +/** Sets the value of the key pointer used for deleted entries in the table. + * + * The assumption is that usually keys are actual pointers, so we use a + * default value of a pointer to an arbitrary piece of storage in the library. + * But in some cases a consumer wants to store some other sort of value in the + * table, like a uint32_t, in which case that pointer may conflict with one of + * their valid keys. This lets that user select a safe value. + * + * This must be called before any keys are actually deleted from the table. + */ +void +_mesa_hash_table_set_deleted_key(struct hash_table *ht, const void *deleted_key) +{ + ht->deleted_key = deleted_key; +} + +static struct hash_entry * +hash_table_search(struct hash_table *ht, uint32_t hash, const void *key) +{ + assert(!key_pointer_is_reserved(ht, key)); + + uint32_t size = ht->size; + uint32_t start_hash_address = util_fast_urem32(hash, size, ht->size_magic); + uint32_t double_hash = 1 + util_fast_urem32(hash, ht->rehash, + ht->rehash_magic); + uint32_t hash_address = start_hash_address; + + do { + struct hash_entry *entry = ht->table + hash_address; + + if (entry_is_free(entry)) { + return NULL; + } else if (entry_is_present(ht, entry) && entry->hash == hash) { + if (ht->key_equals_function(key, entry->key)) { + return entry; + } + } + + hash_address += double_hash; + if (hash_address >= size) + hash_address -= size; + } while (hash_address != start_hash_address); + + return NULL; +} + +/** + * Finds a hash table entry with the given key and hash of that key. + * + * Returns NULL if no entry is found. Note that the data pointer may be + * modified by the user. 
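+ *
+ * A minimal lookup sketch (the table, key, and value names here are
+ * hypothetical, added for illustration):
+ *
+ *    struct hash_entry *e = _mesa_hash_table_search(ht, "mykey");
+ *    if (e != NULL)
+ *       e->data = updated_value;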
+ */ +struct hash_entry * +_mesa_hash_table_search(struct hash_table *ht, const void *key) +{ + assert(ht->key_hash_function); + return hash_table_search(ht, ht->key_hash_function(key), key); +} + +struct hash_entry * +_mesa_hash_table_search_pre_hashed(struct hash_table *ht, uint32_t hash, + const void *key) +{ + assert(ht->key_hash_function == NULL || hash == ht->key_hash_function(key)); + return hash_table_search(ht, hash, key); +} + +static struct hash_entry * +hash_table_insert(struct hash_table *ht, uint32_t hash, + const void *key, void *data); + +static void +hash_table_insert_rehash(struct hash_table *ht, uint32_t hash, + const void *key, void *data) +{ + uint32_t size = ht->size; + uint32_t start_hash_address = util_fast_urem32(hash, size, ht->size_magic); + uint32_t double_hash = 1 + util_fast_urem32(hash, ht->rehash, + ht->rehash_magic); + uint32_t hash_address = start_hash_address; + do { + struct hash_entry *entry = ht->table + hash_address; + + if (likely(entry->key == NULL)) { + entry->hash = hash; + entry->key = key; + entry->data = data; + return; + } + + hash_address += double_hash; + if (hash_address >= size) + hash_address -= size; + } while (true); +} + +static void +_mesa_hash_table_rehash(struct hash_table *ht, unsigned new_size_index) +{ + struct hash_table old_ht; + struct hash_entry *table; + + if (new_size_index >= ARRAY_SIZE(hash_sizes)) + return; + + table = rzalloc_array(ralloc_parent(ht->table), struct hash_entry, + hash_sizes[new_size_index].size); + if (table == NULL) + return; + + old_ht = *ht; + + ht->table = table; + ht->size_index = new_size_index; + ht->size = hash_sizes[ht->size_index].size; + ht->rehash = hash_sizes[ht->size_index].rehash; + ht->size_magic = hash_sizes[ht->size_index].size_magic; + ht->rehash_magic = hash_sizes[ht->size_index].rehash_magic; + ht->max_entries = hash_sizes[ht->size_index].max_entries; + ht->entries = 0; + ht->deleted_entries = 0; + + hash_table_foreach(&old_ht, entry) { + hash_table_insert_rehash(ht, entry->hash, entry->key, entry->data); + } + + ht->entries = old_ht.entries; + + ralloc_free(old_ht.table); +} + +static struct hash_entry * +hash_table_insert(struct hash_table *ht, uint32_t hash, + const void *key, void *data) +{ + struct hash_entry *available_entry = NULL; + + assert(!key_pointer_is_reserved(ht, key)); + + if (ht->entries >= ht->max_entries) { + _mesa_hash_table_rehash(ht, ht->size_index + 1); + } else if (ht->deleted_entries + ht->entries >= ht->max_entries) { + _mesa_hash_table_rehash(ht, ht->size_index); + } + + uint32_t size = ht->size; + uint32_t start_hash_address = util_fast_urem32(hash, size, ht->size_magic); + uint32_t double_hash = 1 + util_fast_urem32(hash, ht->rehash, + ht->rehash_magic); + uint32_t hash_address = start_hash_address; + do { + struct hash_entry *entry = ht->table + hash_address; + + if (!entry_is_present(ht, entry)) { + /* Stash the first available entry we find */ + if (available_entry == NULL) + available_entry = entry; + if (entry_is_free(entry)) + break; + } + + /* Implement replacement when another insert happens + * with a matching key. This is a relatively common + * feature of hash tables, with the alternative + * generally being "insert the new value as well, and + * return it first when the key is searched for". + * + * Note that the hash table doesn't have a delete + * callback. If freeing of old data pointers is + * required to avoid memory leaks, perform a search + * before inserting. 
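+       *
+       * For example (an illustrative sketch, not part of the original
+       * source), freeing a replaced value explicitly:
+       *
+       *    struct hash_entry *old = _mesa_hash_table_search(ht, key);
+       *    if (old)
+       *       free(old->data);
+       *    _mesa_hash_table_insert(ht, key, new_data);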
*/
+      if (!entry_is_deleted(ht, entry) &&
+          entry->hash == hash &&
+          ht->key_equals_function(key, entry->key)) {
+         entry->key = key;
+         entry->data = data;
+         return entry;
+      }
+
+      hash_address += double_hash;
+      if (hash_address >= size)
+         hash_address -= size;
+   } while (hash_address != start_hash_address);
+
+   if (available_entry) {
+      if (entry_is_deleted(ht, available_entry))
+         ht->deleted_entries--;
+      available_entry->hash = hash;
+      available_entry->key = key;
+      available_entry->data = data;
+      ht->entries++;
+      return available_entry;
+   }
+
+   /* We could hit here if a required resize failed. An unchecked-malloc
+    * application could ignore this result.
+    */
+   return NULL;
+}
+
+/**
+ * Inserts the key with the given hash into the table.
+ *
+ * Note that insertion may rearrange the table on a resize or rehash,
+ * so previously found hash_entries are no longer valid after this function.
+ */
+struct hash_entry *
+_mesa_hash_table_insert(struct hash_table *ht, const void *key, void *data)
+{
+   assert(ht->key_hash_function);
+   return hash_table_insert(ht, ht->key_hash_function(key), key, data);
+}
+
+struct hash_entry *
+_mesa_hash_table_insert_pre_hashed(struct hash_table *ht, uint32_t hash,
+                                   const void *key, void *data)
+{
+   assert(ht->key_hash_function == NULL || hash == ht->key_hash_function(key));
+   return hash_table_insert(ht, hash, key, data);
+}
+
+/**
+ * This function deletes the given hash table entry.
+ *
+ * Note that deletion doesn't otherwise modify the table, so an iteration over
+ * the table deleting entries is safe.
+ */
+void
+_mesa_hash_table_remove(struct hash_table *ht,
+                        struct hash_entry *entry)
+{
+   if (!entry)
+      return;
+
+   entry->key = ht->deleted_key;
+   ht->entries--;
+   ht->deleted_entries++;
+}
+
+/**
+ * Removes the entry with the corresponding key, if it exists.
+ */
+void _mesa_hash_table_remove_key(struct hash_table *ht,
+                                 const void *key)
+{
+   _mesa_hash_table_remove(ht, _mesa_hash_table_search(ht, key));
+}
+
+/**
+ * This function is an iterator over the hash table.
+ *
+ * Pass in NULL for the first entry, as in the start of a for loop. Note that
+ * an iteration over the table is O(table_size) not O(entries).
+ */
+struct hash_entry *
+_mesa_hash_table_next_entry(struct hash_table *ht,
+                            struct hash_entry *entry)
+{
+   if (entry == NULL)
+      entry = ht->table;
+   else
+      entry = entry + 1;
+
+   for (; entry != ht->table + ht->size; entry++) {
+      if (entry_is_present(ht, entry)) {
+         return entry;
+      }
+   }
+
+   return NULL;
+}
+
+/**
+ * Returns a random entry from the hash table.
+ *
+ * This may be useful in implementing random replacement (as opposed
+ * to just removing everything) in caches based on this hash table
+ * implementation. @predicate may be used to filter entries, or may
+ * be set to NULL for no filtering.
*/
+struct hash_entry *
+_mesa_hash_table_random_entry(struct hash_table *ht,
+                              bool (*predicate)(struct hash_entry *entry))
+{
+   struct hash_entry *entry;
+   uint32_t i = rand() % ht->size;
+
+   if (ht->entries == 0)
+      return NULL;
+
+   for (entry = ht->table + i; entry != ht->table + ht->size; entry++) {
+      if (entry_is_present(ht, entry) &&
+          (!predicate || predicate(entry))) {
+         return entry;
+      }
+   }
+
+   for (entry = ht->table; entry != ht->table + i; entry++) {
+      if (entry_is_present(ht, entry) &&
+          (!predicate || predicate(entry))) {
+         return entry;
+      }
+   }
+
+   return NULL;
+}
+
+
+/**
+ * Quick FNV-1a hash implementation based on:
+ * http://www.isthe.com/chongo/tech/comp/fnv/
+ *
+ * FNV-1a is not the best hash out there -- Jenkins's lookup3 is supposed
+ * to be quite good, and it probably beats FNV. But FNV has the advantage
+ * that it involves almost no code. For an improvement on both, see Paul
+ * Hsieh's http://www.azillionmonkeys.com/qed/hash.html
+ */
+uint32_t
+_mesa_hash_data(const void *data, size_t size)
+{
+   return _mesa_fnv32_1a_accumulate_block(_mesa_fnv32_1a_offset_bias,
+                                          data, size);
+}
+
+/** FNV-1a string hash implementation */
+uint32_t
+_mesa_hash_string(const void *_key)
+{
+   uint32_t hash = _mesa_fnv32_1a_offset_bias;
+   const char *key = _key;
+
+   while (*key != 0) {
+      hash = _mesa_fnv32_1a_accumulate(hash, *key);
+      key++;
+   }
+
+   return hash;
+}
+
+/**
+ * String compare function for use as the comparison callback in
+ * _mesa_hash_table_create().
+ */
+bool
+_mesa_key_string_equal(const void *a, const void *b)
+{
+   return strcmp(a, b) == 0;
+}
+
+bool
+_mesa_key_pointer_equal(const void *a, const void *b)
+{
+   return a == b;
+}
+
+/**
+ * Helper to create a hash table with pointer keys.
+ */
+struct hash_table *
+_mesa_pointer_hash_table_create(void *mem_ctx)
+{
+   return _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
+                                  _mesa_key_pointer_equal);
+}
+
+/**
+ * Hash table wrapper which supports 64-bit keys.
+ *
+ * TODO: unify all hash table implementations.
+ */
+
+struct hash_key_u64 {
+   uint64_t value;
+};
+
+static uint32_t
+key_u64_hash(const void *key)
+{
+   return _mesa_hash_data(key, sizeof(struct hash_key_u64));
+}
+
+static bool
+key_u64_equals(const void *a, const void *b)
+{
+   const struct hash_key_u64 *aa = a;
+   const struct hash_key_u64 *bb = b;
+
+   return aa->value == bb->value;
+}
+
+#define FREED_KEY_VALUE 0
+
+struct hash_table_u64 *
+_mesa_hash_table_u64_create(void *mem_ctx)
+{
+   STATIC_ASSERT(FREED_KEY_VALUE != DELETED_KEY_VALUE);
+   struct hash_table_u64 *ht;
+
+   ht = CALLOC_STRUCT(hash_table_u64);
+   if (!ht)
+      return NULL;
+
+   if (sizeof(void *) == 8) {
+      ht->table = _mesa_hash_table_create(mem_ctx, _mesa_hash_pointer,
+                                          _mesa_key_pointer_equal);
+   } else {
+      ht->table = _mesa_hash_table_create(mem_ctx, key_u64_hash,
+                                          key_u64_equals);
+   }
+
+   if (ht->table)
+      _mesa_hash_table_set_deleted_key(ht->table, uint_key(DELETED_KEY_VALUE));
+
+   return ht;
+}
+
+void
+_mesa_hash_table_u64_clear(struct hash_table_u64 *ht,
+                           void (*delete_function)(struct hash_entry *entry))
+{
+   if (!ht)
+      return;
+
+   if (ht->deleted_key_data) {
+      if (delete_function) {
+         struct hash_table *table = ht->table;
+         struct hash_entry entry;
+
+         /* Create a fake entry for the delete function.
*/
+         if (sizeof(void *) == 8) {
+            entry.hash = table->key_hash_function(table->deleted_key);
+         } else {
+            struct hash_key_u64 _key = { .value = (uintptr_t)table->deleted_key };
+            entry.hash = table->key_hash_function(&_key);
+         }
+         entry.key = table->deleted_key;
+         entry.data = ht->deleted_key_data;
+
+         delete_function(&entry);
+      }
+      ht->deleted_key_data = NULL;
+   }
+
+   if (ht->freed_key_data) {
+      if (delete_function) {
+         struct hash_table *table = ht->table;
+         struct hash_entry entry;
+
+         /* Create a fake entry for the delete function. */
+         if (sizeof(void *) == 8) {
+            entry.hash = table->key_hash_function(uint_key(FREED_KEY_VALUE));
+         } else {
+            struct hash_key_u64 _key = { .value = (uintptr_t)FREED_KEY_VALUE };
+            entry.hash = table->key_hash_function(&_key);
+         }
+         entry.key = uint_key(FREED_KEY_VALUE);
+         entry.data = ht->freed_key_data;
+
+         delete_function(&entry);
+      }
+      ht->freed_key_data = NULL;
+   }
+
+   _mesa_hash_table_clear(ht->table, delete_function);
+}
+
+void
+_mesa_hash_table_u64_destroy(struct hash_table_u64 *ht,
+                             void (*delete_function)(struct hash_entry *entry))
+{
+   if (!ht)
+      return;
+
+   _mesa_hash_table_u64_clear(ht, delete_function);
+   _mesa_hash_table_destroy(ht->table, delete_function);
+   free(ht);
+}
+
+void
+_mesa_hash_table_u64_insert(struct hash_table_u64 *ht, uint64_t key,
+                            void *data)
+{
+   if (key == FREED_KEY_VALUE) {
+      ht->freed_key_data = data;
+      return;
+   }
+
+   if (key == DELETED_KEY_VALUE) {
+      ht->deleted_key_data = data;
+      return;
+   }
+
+   if (sizeof(void *) == 8) {
+      _mesa_hash_table_insert(ht->table, (void *)(uintptr_t)key, data);
+   } else {
+      struct hash_key_u64 *_key = CALLOC_STRUCT(hash_key_u64);
+
+      if (!_key)
+         return;
+      _key->value = key;
+
+      _mesa_hash_table_insert(ht->table, _key, data);
+   }
+}
+
+static struct hash_entry *
+hash_table_u64_search(struct hash_table_u64 *ht, uint64_t key)
+{
+   if (sizeof(void *) == 8) {
+      return _mesa_hash_table_search(ht->table, (void *)(uintptr_t)key);
+   } else {
+      struct hash_key_u64 _key = { .value = key };
+      return _mesa_hash_table_search(ht->table, &_key);
+   }
+}
+
+void *
+_mesa_hash_table_u64_search(struct hash_table_u64 *ht, uint64_t key)
+{
+   struct hash_entry *entry;
+
+   if (key == FREED_KEY_VALUE)
+      return ht->freed_key_data;
+
+   if (key == DELETED_KEY_VALUE)
+      return ht->deleted_key_data;
+
+   entry = hash_table_u64_search(ht, key);
+   if (!entry)
+      return NULL;
+
+   return entry->data;
+}
+
+void
+_mesa_hash_table_u64_remove(struct hash_table_u64 *ht, uint64_t key)
+{
+   struct hash_entry *entry;
+
+   if (key == FREED_KEY_VALUE) {
+      ht->freed_key_data = NULL;
+      return;
+   }
+
+   if (key == DELETED_KEY_VALUE) {
+      ht->deleted_key_data = NULL;
+      return;
+   }
+
+   entry = hash_table_u64_search(ht, key);
+   if (!entry)
+      return;
+
+   if (sizeof(void *) == 8) {
+      _mesa_hash_table_remove(ht->table, entry);
+   } else {
+      /* On 32-bit builds the key is a heap-allocated struct hash_key_u64,
+       * so free it after removing the entry. */
+      struct hash_key_u64 *_key = (struct hash_key_u64 *)entry->key;
+
+      _mesa_hash_table_remove(ht->table, entry);
+      free(_key);
+   }
+}
diff --git a/src/mesa/util/hash_table.h b/src/mesa/util/hash_table.h
new file mode 100644
index 00000000..8f1d6860
--- /dev/null
+++ b/src/mesa/util/hash_table.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright © 2009,2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to
permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#ifndef _HASH_TABLE_H
+#define _HASH_TABLE_H
+
+#include <stdlib.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include "../c99_compat.h"
+#include "macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct hash_entry {
+   uint32_t hash;
+   const void *key;
+   void *data;
+};
+
+struct hash_table {
+   struct hash_entry *table;
+   uint32_t (*key_hash_function)(const void *key);
+   bool (*key_equals_function)(const void *a, const void *b);
+   const void *deleted_key;
+   uint32_t size;
+   uint32_t rehash;
+   uint64_t size_magic;
+   uint64_t rehash_magic;
+   uint32_t max_entries;
+   uint32_t size_index;
+   uint32_t entries;
+   uint32_t deleted_entries;
+};
+
+struct hash_table *
+_mesa_hash_table_create(void *mem_ctx,
+                        uint32_t (*key_hash_function)(const void *key),
+                        bool (*key_equals_function)(const void *a,
+                                                    const void *b));
+
+bool
+_mesa_hash_table_init(struct hash_table *ht,
+                      void *mem_ctx,
+                      uint32_t (*key_hash_function)(const void *key),
+                      bool (*key_equals_function)(const void *a,
+                                                  const void *b));
+
+struct hash_table *
+_mesa_hash_table_clone(struct hash_table *src, void *dst_mem_ctx);
+void _mesa_hash_table_destroy(struct hash_table *ht,
+                              void (*delete_function)(struct hash_entry *entry));
+void _mesa_hash_table_clear(struct hash_table *ht,
+                            void (*delete_function)(struct hash_entry *entry));
+void _mesa_hash_table_set_deleted_key(struct hash_table *ht,
+                                      const void *deleted_key);
+
+static inline uint32_t _mesa_hash_table_num_entries(struct hash_table *ht)
+{
+   return ht->entries;
+}
+
+struct hash_entry *
+_mesa_hash_table_insert(struct hash_table *ht, const void *key, void *data);
+struct hash_entry *
+_mesa_hash_table_insert_pre_hashed(struct hash_table *ht, uint32_t hash,
+                                   const void *key, void *data);
+struct hash_entry *
+_mesa_hash_table_search(struct hash_table *ht, const void *key);
+struct hash_entry *
+_mesa_hash_table_search_pre_hashed(struct hash_table *ht, uint32_t hash,
+                                   const void *key);
+void _mesa_hash_table_remove(struct hash_table *ht,
+                             struct hash_entry *entry);
+void _mesa_hash_table_remove_key(struct hash_table *ht,
+                                 const void *key);
+
+struct hash_entry *_mesa_hash_table_next_entry(struct hash_table *ht,
+                                               struct hash_entry *entry);
+struct hash_entry *
+_mesa_hash_table_random_entry(struct hash_table *ht,
+                              bool (*predicate)(struct hash_entry *entry));
+
+uint32_t _mesa_hash_data(const void *data, size_t size);
+uint32_t _mesa_hash_string(const void *key);
+bool _mesa_key_string_equal(const void *a, const void *b);
+bool _mesa_key_pointer_equal(const void *a, const void *b);
+
+static inline uint32_t _mesa_key_hash_string(const void *key)
+{
+   return _mesa_hash_string((const char *)key);
+}
+
+static inline uint32_t _mesa_hash_pointer(const void *pointer)
+{
+   uintptr_t num = (uintptr_t)
pointer; + return (uint32_t) ((num >> 2) ^ (num >> 6) ^ (num >> 10) ^ (num >> 14)); +} + +struct hash_table * +_mesa_pointer_hash_table_create(void *mem_ctx); + +enum { + _mesa_fnv32_1a_offset_bias = 2166136261u, +}; + +static inline uint32_t +_mesa_fnv32_1a_accumulate_block(uint32_t hash, const void *data, size_t size) +{ + const uint8_t *bytes = (const uint8_t *)data; + + while (size-- != 0) { + hash ^= *bytes; + hash = hash * 0x01000193; + bytes++; + } + + return hash; +} + +#define _mesa_fnv32_1a_accumulate(hash, expr) \ + _mesa_fnv32_1a_accumulate_block(hash, &(expr), sizeof(expr)) + +/** + * This foreach function is safe against deletion (which just replaces + * an entry's data with the deleted marker), but not against insertion + * (which may rehash the table, making entry a dangling pointer). + */ +#define hash_table_foreach(ht, entry) \ + for (struct hash_entry *entry = _mesa_hash_table_next_entry(ht, NULL); \ + entry != NULL; \ + entry = _mesa_hash_table_next_entry(ht, entry)) + +static inline void +hash_table_call_foreach(struct hash_table *ht, + void (*callback)(const void *key, + void *data, + void *closure), + void *closure) +{ + hash_table_foreach(ht, entry) + callback(entry->key, entry->data, closure); +} + +/** + * Hash table wrapper which supports 64-bit keys. + */ +struct hash_table_u64 { + struct hash_table *table; + void *freed_key_data; + void *deleted_key_data; +}; + +struct hash_table_u64 * +_mesa_hash_table_u64_create(void *mem_ctx); + +void +_mesa_hash_table_u64_destroy(struct hash_table_u64 *ht, + void (*delete_function)(struct hash_entry *entry)); + +void +_mesa_hash_table_u64_insert(struct hash_table_u64 *ht, uint64_t key, + void *data); + +void * +_mesa_hash_table_u64_search(struct hash_table_u64 *ht, uint64_t key); + +void +_mesa_hash_table_u64_remove(struct hash_table_u64 *ht, uint64_t key); + +void +_mesa_hash_table_u64_clear(struct hash_table_u64 *ht, + void (*delete_function)(struct hash_entry *entry)); + +#ifdef __cplusplus +} /* extern C */ +#endif + +#endif /* _HASH_TABLE_H */ diff --git a/src/mesa/util/list.h b/src/mesa/util/list.h new file mode 100644 index 00000000..91b6cd46 --- /dev/null +++ b/src/mesa/util/list.h @@ -0,0 +1,249 @@ +/************************************************************************** + * + * Copyright 2006 VMware, Inc., Bismarck, ND. USA. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. 
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * List macros heavily inspired by the Linux kernel
+ * list handling. No list looping yet.
+ *
+ * Is not threadsafe, so common operations need to
+ * be protected using an external mutex.
+ */
+
+#ifndef _UTIL_LIST_H_
+#define _UTIL_LIST_H_
+
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <assert.h>
+#include "../c99_compat.h"
+
+#ifdef DEBUG
+#  define list_assert(cond, msg)  assert(cond && msg)
+#else
+#  define list_assert(cond, msg)  (void)(0 && (cond))
+#endif
+
+struct list_head
+{
+   struct list_head *prev;
+   struct list_head *next;
+};
+
+static inline void list_inithead(struct list_head *item)
+{
+   item->prev = item;
+   item->next = item;
+}
+
+static inline void list_add(struct list_head *item, struct list_head *list)
+{
+   item->prev = list;
+   item->next = list->next;
+   list->next->prev = item;
+   list->next = item;
+}
+
+static inline void list_addtail(struct list_head *item, struct list_head *list)
+{
+   item->next = list;
+   item->prev = list->prev;
+   list->prev->next = item;
+   list->prev = item;
+}
+
+static inline bool list_is_empty(const struct list_head *list);
+
+static inline void list_replace(struct list_head *from, struct list_head *to)
+{
+   if (list_is_empty(from)) {
+      list_inithead(to);
+   } else {
+      to->prev = from->prev;
+      to->next = from->next;
+      from->next->prev = to;
+      from->prev->next = to;
+   }
+}
+
+static inline void list_del(struct list_head *item)
+{
+   item->prev->next = item->next;
+   item->next->prev = item->prev;
+   item->prev = item->next = NULL;
+}
+
+static inline void list_delinit(struct list_head *item)
+{
+   item->prev->next = item->next;
+   item->next->prev = item->prev;
+   item->next = item;
+   item->prev = item;
+}
+
+static inline bool list_is_empty(const struct list_head *list)
+{
+   return list->next == list;
+}
+
+/**
+ * Returns whether the list has exactly one element.
+ */
+static inline bool list_is_singular(const struct list_head *list)
+{
+   return list->next != NULL && list->next != list && list->next->next == list;
+}
+
+static inline unsigned list_length(const struct list_head *list)
+{
+   struct list_head *node;
+   unsigned length = 0;
+   for (node = list->next; node != list; node = node->next)
+      length++;
+   return length;
+}
+
+static inline void list_splice(struct list_head *src, struct list_head *dst)
+{
+   if (list_is_empty(src))
+      return;
+
+   src->next->prev = dst;
+   src->prev->next = dst->next;
+   dst->next->prev = src->prev;
+   dst->next = src->next;
+}
+
+static inline void list_splicetail(struct list_head *src, struct list_head *dst)
+{
+   if (list_is_empty(src))
+      return;
+
+   src->prev->next = dst;
+   src->next->prev = dst->prev;
+   dst->prev->next = src->next;
+   dst->prev = src->prev;
+}
+
+static inline void list_validate(const struct list_head *list)
+{
+   struct list_head *node;
+   assert(list->next->prev == list && list->prev->next == list);
+   for (node = list->next; node != list; node = node->next)
+      assert(node->next->prev == node && node->prev->next == node);
+}
+
+#define LIST_ENTRY(__type, __item, __field)   \
+   ((__type *)(((char *)(__item)) - offsetof(__type, __field)))
+
+/**
+ * Cast from a pointer to a member of a struct back to the containing struct.
+ *
+ * 'sample' MUST be initialized, or else the result is undefined!
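+ *
+ * For example (a hypothetical struct and list head, added for
+ * illustration only):
+ *
+ *    struct item { int value; struct list_head link; };
+ *    struct item *pos = NULL;
+ *    pos = container_of(head->next, pos, link);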
+ */ +#ifndef container_of +#define container_of(ptr, sample, member) \ + (void *)((char *)(ptr) \ + - ((char *)&(sample)->member - (char *)(sample))) +#endif + +#define list_first_entry(ptr, type, member) \ + LIST_ENTRY(type, (ptr)->next, member) + +#define list_last_entry(ptr, type, member) \ + LIST_ENTRY(type, (ptr)->prev, member) + + +#define LIST_FOR_EACH_ENTRY(pos, head, member) \ + for (pos = NULL, pos = container_of((head)->next, pos, member); \ + &pos->member != (head); \ + pos = container_of(pos->member.next, pos, member)) + +#define LIST_FOR_EACH_ENTRY_SAFE(pos, storage, head, member) \ + for (pos = NULL, pos = container_of((head)->next, pos, member), \ + storage = container_of(pos->member.next, pos, member); \ + &pos->member != (head); \ + pos = storage, storage = container_of(storage->member.next, storage, member)) + +#define LIST_FOR_EACH_ENTRY_SAFE_REV(pos, storage, head, member) \ + for (pos = NULL, pos = container_of((head)->prev, pos, member), \ + storage = container_of(pos->member.prev, pos, member); \ + &pos->member != (head); \ + pos = storage, storage = container_of(storage->member.prev, storage, member)) + +#define LIST_FOR_EACH_ENTRY_FROM(pos, start, head, member) \ + for (pos = NULL, pos = container_of((start), pos, member); \ + &pos->member != (head); \ + pos = container_of(pos->member.next, pos, member)) + +#define LIST_FOR_EACH_ENTRY_FROM_REV(pos, start, head, member) \ + for (pos = NULL, pos = container_of((start), pos, member); \ + &pos->member != (head); \ + pos = container_of(pos->member.prev, pos, member)) + +#define list_for_each_entry(type, pos, head, member) \ + for (type *pos = LIST_ENTRY(type, (head)->next, member), \ + *__next = LIST_ENTRY(type, pos->member.next, member); \ + &pos->member != (head); \ + pos = LIST_ENTRY(type, pos->member.next, member), \ + list_assert(pos == __next, "use _safe iterator"), \ + __next = LIST_ENTRY(type, __next->member.next, member)) + +#define list_for_each_entry_safe(type, pos, head, member) \ + for (type *pos = LIST_ENTRY(type, (head)->next, member), \ + *__next = LIST_ENTRY(type, pos->member.next, member); \ + &pos->member != (head); \ + pos = __next, \ + __next = LIST_ENTRY(type, __next->member.next, member)) + +#define list_for_each_entry_rev(type, pos, head, member) \ + for (type *pos = LIST_ENTRY(type, (head)->prev, member), \ + *__prev = LIST_ENTRY(type, pos->member.prev, member); \ + &pos->member != (head); \ + pos = LIST_ENTRY(type, pos->member.prev, member), \ + list_assert(pos == __prev, "use _safe iterator"), \ + __prev = LIST_ENTRY(type, __prev->member.prev, member)) + +#define list_for_each_entry_safe_rev(type, pos, head, member) \ + for (type *pos = LIST_ENTRY(type, (head)->prev, member), \ + *__prev = LIST_ENTRY(type, pos->member.prev, member); \ + &pos->member != (head); \ + pos = __prev, \ + __prev = LIST_ENTRY(type, __prev->member.prev, member)) + +#define list_for_each_entry_from(type, pos, start, head, member) \ + for (type *pos = LIST_ENTRY(type, (start), member); \ + &pos->member != (head); \ + pos = LIST_ENTRY(type, pos->member.next, member)) + +#define list_for_each_entry_from_rev(type, pos, start, head, member) \ + for (type *pos = LIST_ENTRY(type, (start), member); \ + &pos->member != (head); \ + pos = LIST_ENTRY(type, pos->member.prev, member)) + +#endif /*_UTIL_LIST_H_*/ diff --git a/src/mesa/util/macros.h b/src/mesa/util/macros.h new file mode 100644 index 00000000..16c88dbb --- /dev/null +++ b/src/mesa/util/macros.h @@ -0,0 +1,335 @@ +/* + * Copyright © 2014 Intel Corporation + * + * 
Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef UTIL_MACROS_H
+#define UTIL_MACROS_H
+
+#include <assert.h>
+
+#include "../c99_compat.h"
+#include "../c11_compat.h"
+
+/* Compute the size of an array */
+#ifndef ARRAY_SIZE
+#  define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+/* For compatibility with Clang's __has_builtin() */
+#ifndef __has_builtin
+#  define __has_builtin(x) 0
+#endif
+
+/**
+ * __builtin_expect macros
+ */
+#if !defined(HAVE___BUILTIN_EXPECT)
+#  define __builtin_expect(x, y) (x)
+#endif
+
+#ifndef likely
+#  ifdef HAVE___BUILTIN_EXPECT
+#    define likely(x)   __builtin_expect(!!(x), 1)
+#    define unlikely(x) __builtin_expect(!!(x), 0)
+#  else
+#    define likely(x)   (x)
+#    define unlikely(x) (x)
+#  endif
+#endif
+
+
+/**
+ * Static (compile-time) assertion.
+ * Basically, use COND to dimension an array. If COND is false/zero the
+ * array size will be -1 and we'll get a compilation error.
+ */
+#define STATIC_ASSERT(COND) \
+   do { \
+      (void) sizeof(char [1 - 2*!(COND)]); \
+   } while (0)
+
+
+/**
+ * Unreachable macro. Useful for suppressing "control reaches end of non-void
+ * function" warnings.
+ */
+#if defined(HAVE___BUILTIN_UNREACHABLE) || __has_builtin(__builtin_unreachable)
+#define unreachable(str)    \
+do {                        \
+   assert(!str);            \
+   __builtin_unreachable(); \
+} while (0)
+#elif defined (_MSC_VER)
+#define unreachable(str)    \
+do {                        \
+   assert(!str);            \
+   __assume(0);             \
+} while (0)
+#else
+#define unreachable(str) assert(!str)
+#endif
+
+/**
+ * Assume macro. Useful for expressing our assumptions to the compiler,
+ * typically for purposes of silencing warnings.
+ */
+#if __has_builtin(__builtin_assume)
+#define assume(expr)       \
+do {                       \
+   assert(expr);           \
+   __builtin_assume(expr); \
+} while (0)
+#elif defined HAVE___BUILTIN_UNREACHABLE
+#define assume(expr) ((expr) ? ((void) 0) \
+                             : (assert(!"assumption failed"), \
+                                __builtin_unreachable()))
+#elif defined (_MSC_VER)
+#define assume(expr) __assume(expr)
+#else
+#define assume(expr) assert(expr)
+#endif
+
+/* Attribute const is used for functions that have no effects other than their
+ * return value, and only rely on the argument values to compute the return
+ * value. As a result, calls to it can be CSEed. Note that using memory
+ * pointed to by the arguments is not allowed for const functions.
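+ *
+ * For example, a pure-arithmetic helper could be declared as follows
+ * (an illustrative sketch, not part of the original header):
+ *
+ *    static int square(int x) ATTRIBUTE_CONST;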
+ */ +#ifdef HAVE_FUNC_ATTRIBUTE_CONST +#define ATTRIBUTE_CONST __attribute__((__const__)) +#else +#define ATTRIBUTE_CONST +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_FLATTEN +#define FLATTEN __attribute__((__flatten__)) +#else +#define FLATTEN +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_FORMAT +#define PRINTFLIKE(f, a) __attribute__ ((format(__printf__, f, a))) +#else +#define PRINTFLIKE(f, a) +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_MALLOC +#define MALLOCLIKE __attribute__((__malloc__)) +#else +#define MALLOCLIKE +#endif + +/* Forced function inlining */ +/* Note: Clang also sets __GNUC__ (see other cases below) */ +#ifndef ALWAYS_INLINE +# if defined(__GNUC__) +# define ALWAYS_INLINE inline __attribute__((always_inline)) +# elif defined(_MSC_VER) +# define ALWAYS_INLINE __forceinline +# else +# define ALWAYS_INLINE inline +# endif +#endif + +/* Used to optionally mark structures with misaligned elements or size as + * packed, to trade off performance for space. + */ +#ifdef HAVE_FUNC_ATTRIBUTE_PACKED +#define PACKED __attribute__((__packed__)) +#else +#define PACKED +#endif + +/* Attribute pure is used for functions that have no effects other than their + * return value. As a result, calls to it can be dead code eliminated. + */ +#ifdef HAVE_FUNC_ATTRIBUTE_PURE +#define ATTRIBUTE_PURE __attribute__((__pure__)) +#else +#define ATTRIBUTE_PURE +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_RETURNS_NONNULL +#define ATTRIBUTE_RETURNS_NONNULL __attribute__((__returns_nonnull__)) +#else +#define ATTRIBUTE_RETURNS_NONNULL +#endif + +#ifndef NORETURN +# ifdef _MSC_VER +# define NORETURN __declspec(noreturn) +# elif defined HAVE_FUNC_ATTRIBUTE_NORETURN +# define NORETURN __attribute__((__noreturn__)) +# else +# define NORETURN +# endif +#endif + +#ifdef __cplusplus +/** + * Macro function that evaluates to true if T is a trivially + * destructible type -- that is, if its (non-virtual) destructor + * performs no action and all member variables and base classes are + * trivially destructible themselves. + */ +# if (defined(__clang__) && defined(__has_feature)) +# if __has_feature(has_trivial_destructor) +# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T) +# endif +# elif defined(__GNUC__) +# if ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 3))) +# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T) +# endif +# elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) +# define HAS_TRIVIAL_DESTRUCTOR(T) __has_trivial_destructor(T) +# endif +# ifndef HAS_TRIVIAL_DESTRUCTOR + /* It's always safe (if inefficient) to assume that a + * destructor is non-trivial. + */ +# define HAS_TRIVIAL_DESTRUCTOR(T) (false) +# endif +#endif + +/** + * PUBLIC/USED macros + * + * If we build the library with gcc's -fvisibility=hidden flag, we'll + * use the PUBLIC macro to mark functions that are to be exported. + * + * We also need to define a USED attribute, so the optimizer doesn't + * inline a static function that we later use in an alias. - ajax + */ +#ifndef PUBLIC +# if defined(__GNUC__) +# define PUBLIC __attribute__((visibility("default"))) +# define USED __attribute__((used)) +# elif defined(_MSC_VER) +# define PUBLIC __declspec(dllexport) +# define USED +# else +# define PUBLIC +# define USED +# endif +#endif + +/** + * UNUSED marks variables (or sometimes functions) that have to be defined, + * but are sometimes (or always) unused beyond that. A common case is for + * a function parameter to be used in some build configurations but not others. 
+ * Another case is fallback vfuncs that don't do anything with their params. + * + * Note that this should not be used for identifiers used in `assert()`; + * see ASSERTED below. + */ +#ifdef HAVE_FUNC_ATTRIBUTE_UNUSED +#define UNUSED __attribute__((unused)) +#else +#define UNUSED +#endif + +/** + * Use ASSERTED to indicate that an identifier is unused outside of an `assert()`, + * so that assert-free builds don't get "unused variable" warnings. + */ +#ifdef NDEBUG +#define ASSERTED UNUSED +#else +#define ASSERTED +#endif + +#ifdef HAVE_FUNC_ATTRIBUTE_WARN_UNUSED_RESULT +#define MUST_CHECK __attribute__((warn_unused_result)) +#else +#define MUST_CHECK +#endif + +#if defined(__GNUC__) +#define ATTRIBUTE_NOINLINE __attribute__((noinline)) +#else +#define ATTRIBUTE_NOINLINE +#endif + + +/** + * Check that STRUCT::FIELD can hold MAXVAL. We use a lot of bitfields + * in Mesa/gallium. We have to be sure they're of sufficient size to + * hold the largest expected value. + * Note that with MSVC, enums are signed and enum bitfields need one extra + * high bit (always zero) to ensure the max value is handled correctly. + * This macro will detect that with MSVC, but not GCC. + */ +#define ASSERT_BITFIELD_SIZE(STRUCT, FIELD, MAXVAL) \ + do { \ + ASSERTED STRUCT s; \ + s.FIELD = (MAXVAL); \ + assert((int) s.FIELD == (MAXVAL) && "Insufficient bitfield size!"); \ + } while (0) + + +/** Compute ceiling of integer quotient of A divided by B. */ +#define DIV_ROUND_UP( A, B ) ( ((A) + (B) - 1) / (B) ) + +/** Clamp X to [MIN,MAX]. Turn NaN into MIN, arbitrarily. */ +#define CLAMP( X, MIN, MAX ) ( (X)>(MIN) ? ((X)>(MAX) ? (MAX) : (X)) : (MIN) ) + +/** Minimum of two values: */ +#define MIN2( A, B ) ( (A)<(B) ? (A) : (B) ) + +/** Maximum of two values: */ +#define MAX2( A, B ) ( (A)>(B) ? (A) : (B) ) + +/** Minimum and maximum of three values: */ +#define MIN3( A, B, C ) ((A) < (B) ? MIN2(A, C) : MIN2(B, C)) +#define MAX3( A, B, C ) ((A) > (B) ? MAX2(A, C) : MAX2(B, C)) + +/** Align a value to a power of two */ +#define ALIGN_POT(x, pot_align) (((x) + (pot_align) - 1) & ~((pot_align) - 1)) + +/** + * Macro for declaring an explicit conversion operator. Defaults to an + * implicit conversion if C++11 is not supported. + */ +#if __cplusplus >= 201103L +#define EXPLICIT_CONVERSION explicit +#elif defined(__cplusplus) +#define EXPLICIT_CONVERSION +#endif + +/** Set a single bit */ +#define BITFIELD_BIT(b) (1u << (b)) +/** Set all bits up to excluding bit b */ +#define BITFIELD_MASK(b) \ + ((b) == 32 ? (~0u) : BITFIELD_BIT((b) % 32) - 1) +/** Set count bits starting from bit b */ +#define BITFIELD_RANGE(b, count) \ + (BITFIELD_MASK((b) + (count)) & ~BITFIELD_MASK(b)) + +/** Set a single bit */ +#define BITFIELD64_BIT(b) (1ull << (b)) +/** Set all bits up to excluding bit b */ +#define BITFIELD64_MASK(b) \ + ((b) == 64 ? 
(~0ull) : BITFIELD64_BIT(b) - 1)
+/** Set count bits starting from bit b  */
+#define BITFIELD64_RANGE(b, count) \
+   (BITFIELD64_MASK((b) + (count)) & ~BITFIELD64_MASK(b))
+
+#endif /* UTIL_MACROS_H */
diff --git a/src/mesa/util/os_socket.c b/src/mesa/util/os_socket.c
new file mode 100644
index 00000000..98ef0132
--- /dev/null
+++ b/src/mesa/util/os_socket.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2019 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <errno.h>
+
+#include "os_socket.h"
+
+#if defined(__linux__)
+
+#include <fcntl.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+int
+os_socket_listen_abstract(const char *path, int count)
+{
+   int s = socket(AF_UNIX, SOCK_STREAM, 0);
+   if (s < 0)
+      return -1;
+
+   struct sockaddr_un addr;
+   memset(&addr, 0, sizeof(addr));
+   addr.sun_family = AF_UNIX;
+   strncpy(addr.sun_path + 1, path, sizeof(addr.sun_path) - 2);
+
+   /* Create an abstract socket */
+   int ret = bind(s, (struct sockaddr*)&addr,
+                  offsetof(struct sockaddr_un, sun_path) +
+                  strlen(path) + 1);
+   if (ret < 0) {
+      /* Don't leak the descriptor on a failed bind. */
+      close(s);
+      return -1;
+   }
+
+   listen(s, count);
+
+   return s;
+}
+
+int
+os_socket_accept(int s)
+{
+   return accept(s, NULL, NULL);
+}
+
+ssize_t
+os_socket_recv(int socket, void *buffer, size_t length, int flags)
+{
+   return recv(socket, buffer, length, flags);
+}
+
+ssize_t
+os_socket_send(int socket, const void *buffer, size_t length, int flags)
+{
+   return send(socket, buffer, length, flags);
+}
+
+void
+os_socket_block(int s, bool block)
+{
+   int old = fcntl(s, F_GETFL, 0);
+   if (old == -1)
+      return;
+
+   if (block)
+      fcntl(s, F_SETFL, old & ~O_NONBLOCK);
+   else
+      fcntl(s, F_SETFL, old | O_NONBLOCK);
+}
+
+void
+os_socket_close(int s)
+{
+   close(s);
+}
+
+#else
+
+int
+os_socket_listen_abstract(const char *path, int count)
+{
+   errno = ENOSYS;
+   return -1;
+}
+
+int
+os_socket_accept(int s)
+{
+   errno = ENOSYS;
+   return -1;
+}
+
+ssize_t
+os_socket_recv(int socket, void *buffer, size_t length, int flags)
+{
+   errno = ENOSYS;
+   return -1;
+}
+
+ssize_t
+os_socket_send(int socket, const void *buffer, size_t length, int flags)
+{
+   errno = ENOSYS;
+   return -1;
+}
+
+void
+os_socket_block(int s, bool block)
+{
+}
+
+void
+os_socket_close(int s)
+{
+}
+
+#endif
diff --git a/src/mesa/util/os_socket.h b/src/mesa/util/os_socket.h
new file mode 100644
index 00000000..0d6f8749
--- /dev/null
+++ b/src/mesa/util/os_socket.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright 2019 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ *
+ * Socket operations helpers
+ */
+
+#ifndef _OS_SOCKET_H_
+#define _OS_SOCKET_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#ifdef _MSC_VER
+#include <BaseTsd.h>
+typedef SSIZE_T ssize_t;
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int os_socket_accept(int s);
+
+int os_socket_listen_abstract(const char *path, int count);
+
+ssize_t os_socket_recv(int socket, void *buffer, size_t length, int flags);
+ssize_t os_socket_send(int socket, const void *buffer, size_t length, int flags);
+
+void os_socket_block(int s, bool block);
+void os_socket_close(int s);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OS_SOCKET_H_ */
diff --git a/src/mesa/util/os_time.c b/src/mesa/util/os_time.c
new file mode 100644
index 00000000..969ce5ca
--- /dev/null
+++ b/src/mesa/util/os_time.c
@@ -0,0 +1,195 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * OS independent time-manipulation functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#include "os_time.h"
+#include "detect_os.h"
+
+#if defined(USE_GCC_ATOMIC_BUILTINS)
+/* The builtins with explicit memory model are available since GCC 4.7. */
+#define p_atomic_read(_v) __atomic_load_n((_v), __ATOMIC_ACQUIRE)
+#else
+#define p_atomic_read(_v) (*(_v))
+#endif
+
+#if DETECT_OS_UNIX
+#  include <unistd.h> /* usleep */
+#  include <time.h> /* timeval */
+#  include <sys/time.h> /* timeval */
+#  include <sched.h> /* sched_yield */
+#  include <errno.h>
+#elif DETECT_OS_WINDOWS
+#  include <windows.h>
+#else
+#  error Unsupported OS
+#endif
+
+
+int64_t
+os_time_get_nano(void)
+{
+#if DETECT_OS_LINUX || DETECT_OS_BSD
+
+   struct timespec tv;
+   clock_gettime(CLOCK_MONOTONIC, &tv);
+   return tv.tv_nsec + tv.tv_sec*INT64_C(1000000000);
+
+#elif DETECT_OS_UNIX
+
+   struct timeval tv;
+   gettimeofday(&tv, NULL);
+   return tv.tv_usec*INT64_C(1000) + tv.tv_sec*INT64_C(1000000000);
+
+#elif DETECT_OS_WINDOWS
+
+   static LARGE_INTEGER frequency;
+   LARGE_INTEGER counter;
+   int64_t secs, nanosecs;
+   if(!frequency.QuadPart)
+      QueryPerformanceFrequency(&frequency);
+   QueryPerformanceCounter(&counter);
+   /* Compute seconds and nanoseconds parts separately to
+    * reduce severity of precision loss.
+    */
+   secs = counter.QuadPart / frequency.QuadPart;
+   nanosecs = (counter.QuadPart % frequency.QuadPart) * INT64_C(1000000000)
+              / frequency.QuadPart;
+   return secs*INT64_C(1000000000) + nanosecs;
+
+#else
+
+#error Unsupported OS
+
+#endif
+}
+
+
+
+void
+os_time_sleep(int64_t usecs)
+{
+#if DETECT_OS_LINUX
+   struct timespec time;
+   time.tv_sec = usecs / 1000000;
+   time.tv_nsec = (usecs % 1000000) * 1000;
+   while (clock_nanosleep(CLOCK_MONOTONIC, 0, &time, &time) == EINTR);
+
+#elif DETECT_OS_UNIX
+   usleep(usecs);
+
+#elif DETECT_OS_WINDOWS
+   DWORD dwMilliseconds = (DWORD) ((usecs + 999) / 1000);
+   /* Avoid Sleep(0) as that would sleep for an undetermined duration */
+   if (dwMilliseconds) {
+      Sleep(dwMilliseconds);
+   }
+#else
+#  error Unsupported OS
+#endif
+}
+
+
+
+int64_t
+os_time_get_absolute_timeout(uint64_t timeout)
+{
+   int64_t time, abs_timeout;
+
+   /* Also check for the type upper bound.
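+    * A uint64_t timeout above INT64_MAX cannot be added to the current
+    * time without overflowing the signed result, so it is treated as
+    * infinite as well.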
*/
+   if (timeout == OS_TIMEOUT_INFINITE || timeout > INT64_MAX)
+      return OS_TIMEOUT_INFINITE;
+
+   time = os_time_get_nano();
+   abs_timeout = time + (int64_t)timeout;
+
+   /* Check for overflow. */
+   if (abs_timeout < time)
+      return OS_TIMEOUT_INFINITE;
+
+   return abs_timeout;
+}
+
+
+bool
+os_wait_until_zero(volatile int *var, uint64_t timeout)
+{
+   if (!p_atomic_read(var))
+      return true;
+
+   if (!timeout)
+      return false;
+
+   if (timeout == OS_TIMEOUT_INFINITE) {
+      while (p_atomic_read(var)) {
+#if DETECT_OS_UNIX
+         sched_yield();
+#endif
+      }
+      return true;
+   }
+   else {
+      int64_t start_time = os_time_get_nano();
+      int64_t end_time = start_time + timeout;
+
+      while (p_atomic_read(var)) {
+         if (os_time_timeout(start_time, end_time, os_time_get_nano()))
+            return false;
+
+#if DETECT_OS_UNIX
+         sched_yield();
+#endif
+      }
+      return true;
+   }
+}
+
+
+bool
+os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout)
+{
+   if (!p_atomic_read(var))
+      return true;
+
+   if (timeout == OS_TIMEOUT_INFINITE)
+      return os_wait_until_zero(var, OS_TIMEOUT_INFINITE);
+
+   while (p_atomic_read(var)) {
+      if (os_time_get_nano() >= timeout)
+         return false;
+
+#if DETECT_OS_UNIX
+      sched_yield();
+#endif
+   }
+   return true;
+}
diff --git a/src/mesa/util/os_time.h b/src/mesa/util/os_time.h
new file mode 100644
index 00000000..049ab118
--- /dev/null
+++ b/src/mesa/util/os_time.h
@@ -0,0 +1,130 @@
+/**************************************************************************
+ *
+ * Copyright 2008-2010 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * OS independent time-manipulation functions.
+ *
+ * @author Jose Fonseca <jfonseca@vmware.com>
+ */
+
+#ifndef _OS_TIME_H_
+#define _OS_TIME_H_
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/* must be equal to PIPE_TIMEOUT_INFINITE */
+#define OS_TIMEOUT_INFINITE 0xffffffffffffffffull
+
+/*
+ * Get the current time in nanoseconds from an unknown base.
+ */
+int64_t
+os_time_get_nano(void);
+
+
+/*
+ * Get the current time in microseconds from an unknown base.
+ */
+static inline int64_t
+os_time_get(void)
+{
+   return os_time_get_nano() / 1000;
+}
+
+
+/*
+ * Sleep.
+ */
+void
+os_time_sleep(int64_t usecs);
+
+
+/*
+ * Helper function for detecting timeouts, taking overflow into account.
+ *
+ * Returns true if the current time has elapsed beyond the specified interval.
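+ *
+ * For example, with start=10 and end=20: curr=15 is still inside the
+ * window (returns false), while curr=25 has timed out (returns true).
+ * When the interval wraps around (start > end), the in-window test is
+ * inverted accordingly.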
*/
+static inline bool
+os_time_timeout(int64_t start,
+                int64_t end,
+                int64_t curr)
+{
+   if (start <= end)
+      return !(start <= curr && curr < end);
+   else
+      return !((start <= curr) || (curr < end));
+}
+
+
+/**
+ * Convert a relative timeout in nanoseconds into an absolute timeout,
+ * in other words, it returns current time + timeout.
+ * os_time_get_nano() must be monotonic.
+ * OS_TIMEOUT_INFINITE is passed through unchanged. If the calculation
+ * overflows, OS_TIMEOUT_INFINITE is returned.
+ */
+int64_t
+os_time_get_absolute_timeout(uint64_t timeout);
+
+
+/**
+ * Wait until the variable at the given memory location is zero.
+ *
+ * \param var        variable
+ * \param timeout    timeout in ns, can be anything from 0 (no wait) to
+ *                   OS_TIMEOUT_INFINITE (wait forever)
+ * \return true if the variable is zero
+ */
+bool
+os_wait_until_zero(volatile int *var, uint64_t timeout);
+
+
+/**
+ * Wait until the variable at the given memory location is zero.
+ * The timeout is the absolute time when the waiting should stop. If it is
+ * less than or equal to the current time, it only returns the status and
+ * doesn't wait. OS_TIMEOUT_INFINITE waits forever. This requires that
+ * os_time_get_nano is monotonic.
+ *
+ * \param var        variable
+ * \param timeout    the time in ns when the waiting should stop
+ * \return true if the variable is zero
+ */
+bool
+os_wait_until_zero_abs_timeout(volatile int *var, int64_t timeout);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _OS_TIME_H_ */
diff --git a/src/mesa/util/ralloc.c b/src/mesa/util/ralloc.c
new file mode 100644
index 00000000..0d20223d
--- /dev/null
+++ b/src/mesa/util/ralloc.c
@@ -0,0 +1,920 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+
+/* Some versions of MinGW are missing _vscprintf's declaration, although they
+ * still provide the symbol in the import library.
*/
+#ifdef __MINGW32__
+_CRTIMP int _vscprintf(const char *format, va_list argptr);
+#endif
+
+#include "ralloc.h"
+
+#ifndef va_copy
+#ifdef __va_copy
+#define va_copy(dest, src) __va_copy((dest), (src))
+#else
+#define va_copy(dest, src) (dest) = (src)
+#endif
+#endif
+
+#define CANARY 0x5A1106
+
+/* Align the header's size so that ralloc() allocations will return with the
+ * same alignment as a libc malloc would have (8 on 32-bit GLIBC, 16 on
+ * 64-bit), avoiding performance penalties on x86 and alignment faults on
+ * ARM.
+ */
+struct
+#ifdef _MSC_VER
+ __declspec(align(8))
+#elif defined(__LP64__)
+ __attribute__((aligned(16)))
+#else
+ __attribute__((aligned(8)))
+#endif
+   ralloc_header
+{
+#ifndef NDEBUG
+   /* A canary value used to determine whether a pointer is ralloc'd. */
+   unsigned canary;
+#endif
+
+   struct ralloc_header *parent;
+
+   /* The first child (head of a linked list) */
+   struct ralloc_header *child;
+
+   /* Linked list of siblings */
+   struct ralloc_header *prev;
+   struct ralloc_header *next;
+
+   void (*destructor)(void *);
+};
+
+typedef struct ralloc_header ralloc_header;
+
+static void unlink_block(ralloc_header *info);
+static void unsafe_free(ralloc_header *info);
+
+static ralloc_header *
+get_header(const void *ptr)
+{
+   ralloc_header *info = (ralloc_header *) (((char *) ptr) -
+                                            sizeof(ralloc_header));
+   assert(info->canary == CANARY);
+   return info;
+}
+
+#define PTR_FROM_HEADER(info) (((char *) info) + sizeof(ralloc_header))
+
+static void
+add_child(ralloc_header *parent, ralloc_header *info)
+{
+   if (parent != NULL) {
+      info->parent = parent;
+      info->next = parent->child;
+      parent->child = info;
+
+      if (info->next != NULL)
+         info->next->prev = info;
+   }
+}
+
+void *
+ralloc_context(const void *ctx)
+{
+   return ralloc_size(ctx, 0);
+}
+
+void *
+ralloc_size(const void *ctx, size_t size)
+{
+   void *block = malloc(size + sizeof(ralloc_header));
+   ralloc_header *info;
+   ralloc_header *parent;
+
+   if (unlikely(block == NULL))
+      return NULL;
+
+   info = (ralloc_header *) block;
+   /* measurements have shown that calloc is slower (because of
+    * the multiplication overflow checking?), so clear things
+    * manually
+    */
+   info->parent = NULL;
+   info->child = NULL;
+   info->prev = NULL;
+   info->next = NULL;
+   info->destructor = NULL;
+
+   parent = ctx != NULL ? get_header(ctx) : NULL;
+
+   add_child(parent, info);
+
+#ifndef NDEBUG
+   info->canary = CANARY;
+#endif
+
+   return PTR_FROM_HEADER(info);
+}
+
+void *
+rzalloc_size(const void *ctx, size_t size)
+{
+   void *ptr = ralloc_size(ctx, size);
+
+   if (likely(ptr))
+      memset(ptr, 0, size);
+
+   return ptr;
+}
+
+/* helper function - assumes ptr != NULL */
+static void *
+resize(void *ptr, size_t size)
+{
+   ralloc_header *child, *old, *info;
+
+   old = get_header(ptr);
+   info = realloc(old, size + sizeof(ralloc_header));
+
+   if (info == NULL)
+      return NULL;
+
+   /* Update parent and sibling's links to the reallocated node.
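+    * realloc() may have moved the header, so every pointer that referred
+    * to the old address (the parent's child link and both siblings) must
+    * be repointed at the new one.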
*/ + if (info != old && info->parent != NULL) { + if (info->parent->child == old) + info->parent->child = info; + + if (info->prev != NULL) + info->prev->next = info; + + if (info->next != NULL) + info->next->prev = info; + } + + /* Update child->parent links for all children */ + for (child = info->child; child != NULL; child = child->next) + child->parent = info; + + return PTR_FROM_HEADER(info); +} + +void * +reralloc_size(const void *ctx, void *ptr, size_t size) +{ + if (unlikely(ptr == NULL)) + return ralloc_size(ctx, size); + + assert(ralloc_parent(ptr) == ctx); + return resize(ptr, size); +} + +void * +rerzalloc_size(const void *ctx, void *ptr, size_t old_size, size_t new_size) +{ + if (unlikely(ptr == NULL)) + return rzalloc_size(ctx, new_size); + + assert(ralloc_parent(ptr) == ctx); + ptr = resize(ptr, new_size); + + if (new_size > old_size) + memset((char *)ptr + old_size, 0, new_size - old_size); + + return ptr; +} + +void * +ralloc_array_size(const void *ctx, size_t size, unsigned count) +{ + if (count > SIZE_MAX/size) + return NULL; + + return ralloc_size(ctx, size * count); +} + +void * +rzalloc_array_size(const void *ctx, size_t size, unsigned count) +{ + if (count > SIZE_MAX/size) + return NULL; + + return rzalloc_size(ctx, size * count); +} + +void * +reralloc_array_size(const void *ctx, void *ptr, size_t size, unsigned count) +{ + if (count > SIZE_MAX/size) + return NULL; + + return reralloc_size(ctx, ptr, size * count); +} + +void * +rerzalloc_array_size(const void *ctx, void *ptr, size_t size, + unsigned old_count, unsigned new_count) +{ + if (new_count > SIZE_MAX/size) + return NULL; + + return rerzalloc_size(ctx, ptr, size * old_count, size * new_count); +} + +void +ralloc_free(void *ptr) +{ + ralloc_header *info; + + if (ptr == NULL) + return; + + info = get_header(ptr); + unlink_block(info); + unsafe_free(info); +} + +static void +unlink_block(ralloc_header *info) +{ + /* Unlink from parent & siblings */ + if (info->parent != NULL) { + if (info->parent->child == info) + info->parent->child = info->next; + + if (info->prev != NULL) + info->prev->next = info->next; + + if (info->next != NULL) + info->next->prev = info->prev; + } + info->parent = NULL; + info->prev = NULL; + info->next = NULL; +} + +static void +unsafe_free(ralloc_header *info) +{ + /* Recursively free any children...don't waste time unlinking them. */ + ralloc_header *temp; + while (info->child != NULL) { + temp = info->child; + info->child = temp->next; + unsafe_free(temp); + } + + /* Free the block itself. Call the destructor first, if any. */ + if (info->destructor != NULL) + info->destructor(PTR_FROM_HEADER(info)); + + free(info); +} + +void +ralloc_steal(const void *new_ctx, void *ptr) +{ + ralloc_header *info, *parent; + + if (unlikely(ptr == NULL)) + return; + + info = get_header(ptr); + parent = new_ctx ? get_header(new_ctx) : NULL; + + unlink_block(info); + + add_child(parent, info); +} + +void +ralloc_adopt(const void *new_ctx, void *old_ctx) +{ + ralloc_header *new_info, *old_info, *child; + + if (unlikely(old_ctx == NULL)) + return; + + old_info = get_header(old_ctx); + new_info = get_header(new_ctx); + + /* If there are no children, bail. */ + if (unlikely(old_info->child == NULL)) + return; + + /* Set all the children's parent to new_ctx; get a pointer to the last child. 
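+    * (The loop condition stops before the final child, so `child` is left
+    * pointing at the list tail; its parent is assigned right after the loop
+    * and the tail is then used to splice the two sibling lists together.)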
*/ + for (child = old_info->child; child->next != NULL; child = child->next) { + child->parent = new_info; + } + child->parent = new_info; + + /* Connect the two lists together; parent them to new_ctx; make old_ctx empty. */ + child->next = new_info->child; + if (child->next) + child->next->prev = child; + new_info->child = old_info->child; + old_info->child = NULL; +} + +void * +ralloc_parent(const void *ptr) +{ + ralloc_header *info; + + if (unlikely(ptr == NULL)) + return NULL; + + info = get_header(ptr); + return info->parent ? PTR_FROM_HEADER(info->parent) : NULL; +} + +void +ralloc_set_destructor(const void *ptr, void(*destructor)(void *)) +{ + ralloc_header *info = get_header(ptr); + info->destructor = destructor; +} + +char * +ralloc_strdup(const void *ctx, const char *str) +{ + size_t n; + char *ptr; + + if (unlikely(str == NULL)) + return NULL; + + n = strlen(str); + ptr = ralloc_array(ctx, char, n + 1); + memcpy(ptr, str, n); + ptr[n] = '\0'; + return ptr; +} + +char * +ralloc_strndup(const void *ctx, const char *str, size_t max) +{ + size_t n; + char *ptr; + + if (unlikely(str == NULL)) + return NULL; + + n = strnlen(str, max); + ptr = ralloc_array(ctx, char, n + 1); + memcpy(ptr, str, n); + ptr[n] = '\0'; + return ptr; +} + +/* helper routine for strcat/strncat - n is the exact amount to copy */ +static bool +cat(char **dest, const char *str, size_t n) +{ + char *both; + size_t existing_length; + assert(dest != NULL && *dest != NULL); + + existing_length = strlen(*dest); + both = resize(*dest, existing_length + n + 1); + if (unlikely(both == NULL)) + return false; + + memcpy(both + existing_length, str, n); + both[existing_length + n] = '\0'; + + *dest = both; + return true; +} + + +bool +ralloc_strcat(char **dest, const char *str) +{ + return cat(dest, str, strlen(str)); +} + +bool +ralloc_strncat(char **dest, const char *str, size_t n) +{ + return cat(dest, str, strnlen(str, n)); +} + +bool +ralloc_str_append(char **dest, const char *str, + size_t existing_length, size_t str_size) +{ + char *both; + assert(dest != NULL && *dest != NULL); + + both = resize(*dest, existing_length + str_size + 1); + if (unlikely(both == NULL)) + return false; + + memcpy(both + existing_length, str, str_size); + both[existing_length + str_size] = '\0'; + + *dest = both; + + return true; +} + +char * +ralloc_asprintf(const void *ctx, const char *fmt, ...) +{ + char *ptr; + va_list args; + va_start(args, fmt); + ptr = ralloc_vasprintf(ctx, fmt, args); + va_end(args); + return ptr; +} + +/* Return the length of the string that would be generated by a printf-style + * format and argument list, not including the \0 byte. + */ +static size_t +printf_length(const char *fmt, va_list untouched_args) +{ + int size; + char junk; + + /* Make a copy of the va_list so the original caller can still use it */ + va_list args; + va_copy(args, untouched_args); + +#ifdef _WIN32 + /* We need to use _vcsprintf to calculate the size as vsnprintf returns -1 + * if the number of characters to write is greater than count. + */ + size = _vscprintf(fmt, args); + (void)junk; +#else + size = vsnprintf(&junk, 1, fmt, args); +#endif + assert(size >= 0); + + va_end(args); + + return size; +} + +char * +ralloc_vasprintf(const void *ctx, const char *fmt, va_list args) +{ + size_t size = printf_length(fmt, args) + 1; + + char *ptr = ralloc_size(ctx, size); + if (ptr != NULL) + vsnprintf(ptr, size, fmt, args); + + return ptr; +} + +bool +ralloc_asprintf_append(char **str, const char *fmt, ...) 
+{ + bool success; + va_list args; + va_start(args, fmt); + success = ralloc_vasprintf_append(str, fmt, args); + va_end(args); + return success; +} + +bool +ralloc_vasprintf_append(char **str, const char *fmt, va_list args) +{ + size_t existing_length; + assert(str != NULL); + existing_length = *str ? strlen(*str) : 0; + return ralloc_vasprintf_rewrite_tail(str, &existing_length, fmt, args); +} + +bool +ralloc_asprintf_rewrite_tail(char **str, size_t *start, const char *fmt, ...) +{ + bool success; + va_list args; + va_start(args, fmt); + success = ralloc_vasprintf_rewrite_tail(str, start, fmt, args); + va_end(args); + return success; +} + +bool +ralloc_vasprintf_rewrite_tail(char **str, size_t *start, const char *fmt, + va_list args) +{ + size_t new_length; + char *ptr; + + assert(str != NULL); + + if (unlikely(*str == NULL)) { + // Assuming a NULL context is probably bad, but it's expected behavior. + *str = ralloc_vasprintf(NULL, fmt, args); + *start = strlen(*str); + return true; + } + + new_length = printf_length(fmt, args); + + ptr = resize(*str, *start + new_length + 1); + if (unlikely(ptr == NULL)) + return false; + + vsnprintf(ptr + *start, new_length + 1, fmt, args); + *str = ptr; + *start += new_length; + return true; +} + +/*************************************************************************** + * Linear allocator for short-lived allocations. + *************************************************************************** + * + * The allocator consists of a parent node (2K buffer), which requires + * a ralloc parent, and child nodes (allocations). Child nodes can't be freed + * directly, because the parent doesn't track them. You have to release + * the parent node in order to release all its children. + * + * The allocator uses a fixed-sized buffer with a monotonically increasing + * offset after each allocation. If the buffer is all used, another buffer + * is allocated, sharing the same ralloc parent, so all buffers are at + * the same level in the ralloc hierarchy. + * + * The linear parent node is always the first buffer and keeps track of all + * other buffers. + */ + +#define MIN_LINEAR_BUFSIZE 2048 +#define SUBALLOC_ALIGNMENT 8 +#define LMAGIC 0x87b9c7d3 + +struct +#ifdef _MSC_VER + __declspec(align(8)) +#elif defined(__LP64__) + __attribute__((aligned(16))) +#else + __attribute__((aligned(8))) +#endif + linear_header { +#ifndef NDEBUG + unsigned magic; /* for debugging */ +#endif + unsigned offset; /* points to the first unused byte in the buffer */ + unsigned size; /* size of the buffer */ + void *ralloc_parent; /* new buffers will use this */ + struct linear_header *next; /* next buffer if we have more */ + struct linear_header *latest; /* the only buffer that has free space */ + + /* After this structure, the buffer begins. + * Each suballocation consists of linear_size_chunk as its header followed + * by the suballocation, so it goes: + * + * - linear_size_chunk + * - allocated space + * - linear_size_chunk + * - allocated space + * etc. + * + * linear_size_chunk is only needed by linear_realloc. + */ +}; + +struct linear_size_chunk { + unsigned size; /* for realloc */ + unsigned _padding; +}; + +typedef struct linear_header linear_header; +typedef struct linear_size_chunk linear_size_chunk; + +#define LINEAR_PARENT_TO_HEADER(parent) \ + (linear_header*) \ + ((char*)(parent) - sizeof(linear_size_chunk) - sizeof(linear_header)) + +/* Allocate the linear buffer with its header. 
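+ * The requested size is rounded up to at least MIN_LINEAR_BUFSIZE so that
+ * subsequent small suballocations can be carved out of the same buffer
+ * instead of each taking a separate ralloc allocation.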
*/ +static linear_header * +create_linear_node(void *ralloc_ctx, unsigned min_size) +{ + linear_header *node; + + min_size += sizeof(linear_size_chunk); + + if (likely(min_size < MIN_LINEAR_BUFSIZE)) + min_size = MIN_LINEAR_BUFSIZE; + + node = ralloc_size(ralloc_ctx, sizeof(linear_header) + min_size); + if (unlikely(!node)) + return NULL; + +#ifndef NDEBUG + node->magic = LMAGIC; +#endif + node->offset = 0; + node->size = min_size; + node->ralloc_parent = ralloc_ctx; + node->next = NULL; + node->latest = node; + return node; +} + +void * +linear_alloc_child(void *parent, unsigned size) +{ + linear_header *first = LINEAR_PARENT_TO_HEADER(parent); + linear_header *latest = first->latest; + linear_header *new_node; + linear_size_chunk *ptr; + unsigned full_size; + + assert(first->magic == LMAGIC); + assert(!latest->next); + + size = ALIGN_POT(size, SUBALLOC_ALIGNMENT); + full_size = sizeof(linear_size_chunk) + size; + + if (unlikely(latest->offset + full_size > latest->size)) { + /* allocate a new node */ + new_node = create_linear_node(latest->ralloc_parent, size); + if (unlikely(!new_node)) + return NULL; + + first->latest = new_node; + latest->latest = new_node; + latest->next = new_node; + latest = new_node; + } + + ptr = (linear_size_chunk *)((char*)&latest[1] + latest->offset); + ptr->size = size; + latest->offset += full_size; + + assert((uintptr_t)&ptr[1] % SUBALLOC_ALIGNMENT == 0); + return &ptr[1]; +} + +void * +linear_alloc_parent(void *ralloc_ctx, unsigned size) +{ + linear_header *node; + + if (unlikely(!ralloc_ctx)) + return NULL; + + size = ALIGN_POT(size, SUBALLOC_ALIGNMENT); + + node = create_linear_node(ralloc_ctx, size); + if (unlikely(!node)) + return NULL; + + return linear_alloc_child((char*)node + + sizeof(linear_header) + + sizeof(linear_size_chunk), size); +} + +void * +linear_zalloc_child(void *parent, unsigned size) +{ + void *ptr = linear_alloc_child(parent, size); + + if (likely(ptr)) + memset(ptr, 0, size); + return ptr; +} + +void * +linear_zalloc_parent(void *parent, unsigned size) +{ + void *ptr = linear_alloc_parent(parent, size); + + if (likely(ptr)) + memset(ptr, 0, size); + return ptr; +} + +void +linear_free_parent(void *ptr) +{ + linear_header *node; + + if (unlikely(!ptr)) + return; + + node = LINEAR_PARENT_TO_HEADER(ptr); + assert(node->magic == LMAGIC); + + while (node) { + void *ptr = node; + + node = node->next; + ralloc_free(ptr); + } +} + +void +ralloc_steal_linear_parent(void *new_ralloc_ctx, void *ptr) +{ + linear_header *node; + + if (unlikely(!ptr)) + return; + + node = LINEAR_PARENT_TO_HEADER(ptr); + assert(node->magic == LMAGIC); + + while (node) { + ralloc_steal(new_ralloc_ctx, node); + node->ralloc_parent = new_ralloc_ctx; + node = node->next; + } +} + +void * +ralloc_parent_of_linear_parent(void *ptr) +{ + linear_header *node = LINEAR_PARENT_TO_HEADER(ptr); + assert(node->magic == LMAGIC); + return node->ralloc_parent; +} + +void * +linear_realloc(void *parent, void *old, unsigned new_size) +{ + unsigned old_size = 0; + ralloc_header *new_ptr; + + new_ptr = linear_alloc_child(parent, new_size); + + if (unlikely(!old)) + return new_ptr; + + old_size = ((linear_size_chunk*)old)[-1].size; + + if (likely(new_ptr && old_size)) + memcpy(new_ptr, old, MIN2(old_size, new_size)); + + return new_ptr; +} + +/* All code below is pretty much copied from ralloc and only the alloc + * calls are different. 
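+ * The linear parent handle takes the place of the ralloc context, so the
+ * resulting strings live in the linear buffer and are all released together
+ * by linear_free_parent().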
+ */ + +char * +linear_strdup(void *parent, const char *str) +{ + unsigned n; + char *ptr; + + if (unlikely(!str)) + return NULL; + + n = strlen(str); + ptr = linear_alloc_child(parent, n + 1); + if (unlikely(!ptr)) + return NULL; + + memcpy(ptr, str, n); + ptr[n] = '\0'; + return ptr; +} + +char * +linear_asprintf(void *parent, const char *fmt, ...) +{ + char *ptr; + va_list args; + va_start(args, fmt); + ptr = linear_vasprintf(parent, fmt, args); + va_end(args); + return ptr; +} + +char * +linear_vasprintf(void *parent, const char *fmt, va_list args) +{ + unsigned size = printf_length(fmt, args) + 1; + + char *ptr = linear_alloc_child(parent, size); + if (ptr != NULL) + vsnprintf(ptr, size, fmt, args); + + return ptr; +} + +bool +linear_asprintf_append(void *parent, char **str, const char *fmt, ...) +{ + bool success; + va_list args; + va_start(args, fmt); + success = linear_vasprintf_append(parent, str, fmt, args); + va_end(args); + return success; +} + +bool +linear_vasprintf_append(void *parent, char **str, const char *fmt, va_list args) +{ + size_t existing_length; + assert(str != NULL); + existing_length = *str ? strlen(*str) : 0; + return linear_vasprintf_rewrite_tail(parent, str, &existing_length, fmt, args); +} + +bool +linear_asprintf_rewrite_tail(void *parent, char **str, size_t *start, + const char *fmt, ...) +{ + bool success; + va_list args; + va_start(args, fmt); + success = linear_vasprintf_rewrite_tail(parent, str, start, fmt, args); + va_end(args); + return success; +} + +bool +linear_vasprintf_rewrite_tail(void *parent, char **str, size_t *start, + const char *fmt, va_list args) +{ + size_t new_length; + char *ptr; + + assert(str != NULL); + + if (unlikely(*str == NULL)) { + *str = linear_vasprintf(parent, fmt, args); + *start = strlen(*str); + return true; + } + + new_length = printf_length(fmt, args); + + ptr = linear_realloc(parent, *str, *start + new_length + 1); + if (unlikely(ptr == NULL)) + return false; + + vsnprintf(ptr + *start, new_length + 1, fmt, args); + *str = ptr; + *start += new_length; + return true; +} + +/* helper routine for strcat/strncat - n is the exact amount to copy */ +static bool +linear_cat(void *parent, char **dest, const char *str, unsigned n) +{ + char *both; + unsigned existing_length; + assert(dest != NULL && *dest != NULL); + + existing_length = strlen(*dest); + both = linear_realloc(parent, *dest, existing_length + n + 1); + if (unlikely(both == NULL)) + return false; + + memcpy(both + existing_length, str, n); + both[existing_length + n] = '\0'; + + *dest = both; + return true; +} + +bool +linear_strcat(void *parent, char **dest, const char *str) +{ + return linear_cat(parent, dest, str, strlen(str)); +} diff --git a/src/mesa/util/ralloc.h b/src/mesa/util/ralloc.h new file mode 100644 index 00000000..857ca5f7 --- /dev/null +++ b/src/mesa/util/ralloc.h @@ -0,0 +1,604 @@ +/* + * Copyright © 2010 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file ralloc.h
+ *
+ * ralloc: a recursive memory allocator
+ *
+ * The ralloc memory allocator creates a hierarchy of allocated
+ * objects. Every allocation is in reference to some parent, and
+ * every allocated object can in turn be used as the parent of a
+ * subsequent allocation. This allows for extremely convenient
+ * discarding of an entire tree/sub-tree of allocations by calling
+ * ralloc_free on any particular object to free it and all of its
+ * children.
+ *
+ * The conceptual working of ralloc was directly inspired by Andrew
+ * Tridgell's talloc, but ralloc is an independent implementation
+ * released under the MIT license and tuned for Mesa.
+ *
+ * talloc is more sophisticated than ralloc in that it includes reference
+ * counting and useful debugging features.  However, it is released under
+ * a non-permissive open source license.
+ */
+
+#ifndef RALLOC_H
+#define RALLOC_H
+
+#include <stddef.h>
+#include <stdarg.h>
+#include <stdbool.h>
+
+#include "macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \def ralloc(ctx, type)
+ * Allocate a new object chained off of the given context.
+ *
+ * This is equivalent to:
+ * \code
+ * ((type *) ralloc_size(ctx, sizeof(type)))
+ * \endcode
+ */
+#define ralloc(ctx, type)  ((type *) ralloc_size(ctx, sizeof(type)))
+
+/**
+ * \def rzalloc(ctx, type)
+ * Allocate a new object out of the given context and initialize it to zero.
+ *
+ * This is equivalent to:
+ * \code
+ * ((type *) rzalloc_size(ctx, sizeof(type)))
+ * \endcode
+ */
+#define rzalloc(ctx, type) ((type *) rzalloc_size(ctx, sizeof(type)))
+
+/**
+ * Allocate a new ralloc context.
+ *
+ * While any ralloc'd pointer can be used as a context, sometimes it is useful
+ * to simply allocate a context with no associated memory.
+ *
+ * It is equivalent to:
+ * \code
+ * ((type *) ralloc_size(ctx, 0))
+ * \endcode
+ */
+void *ralloc_context(const void *ctx);
+
+/**
+ * Allocate memory chained off of the given context.
+ *
+ * This is the core allocation routine which is used by all others.  It
+ * simply allocates storage for \p size bytes and returns the pointer,
+ * similar to \c malloc.
+ */
+void *ralloc_size(const void *ctx, size_t size) MALLOCLIKE;
+
+/**
+ * Allocate zero-initialized memory chained off of the given context.
+ *
+ * This is similar to \c calloc with a size of 1.
+ */
+void *rzalloc_size(const void *ctx, size_t size) MALLOCLIKE;
+
+/**
+ * Resize a piece of ralloc-managed memory, preserving data.
+ *
+ * Similar to \c realloc.  Unlike C89, passing 0 for \p size does not free the
+ * memory.  Instead, it resizes it to a 0-byte ralloc context, just like
+ * calling ralloc_size(ctx, 0).  This is different from talloc.
+ *
+ * \param ctx   The context to use for new allocation.  If \p ptr != NULL,
+ *              it must be the same as ralloc_parent(\p ptr).
+ * \param ptr   Pointer to the memory to be resized.  May be NULL.
+ * \param size  The amount of memory to allocate, in bytes.
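+ *
+ * A minimal usage sketch (hypothetical growth of an int buffer):
+ * \code
+ * int *buf = (int *) ralloc_size(ctx, 4 * sizeof(int));
+ * buf = (int *) reralloc_size(ctx, buf, 8 * sizeof(int));  // old data kept
+ * \endcode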
+ */ +void *reralloc_size(const void *ctx, void *ptr, size_t size); + +/** + * Resize a ralloc-managed array, preserving data and initializing any newly + * allocated data to zero. + * + * Similar to \c realloc. Unlike C89, passing 0 for \p size does not free the + * memory. Instead, it resizes it to a 0-byte ralloc context, just like + * calling ralloc_size(ctx, 0). This is different from talloc. + * + * \param ctx The context to use for new allocation. If \p ptr != NULL, + * it must be the same as ralloc_parent(\p ptr). + * \param ptr Pointer to the memory to be resized. May be NULL. + * \param old_size The amount of memory in the previous allocation, in bytes. + * \param new_size The amount of memory to allocate, in bytes. + */ +void *rerzalloc_size(const void *ctx, void *ptr, + size_t old_size, size_t new_size); + +/// \defgroup array Array Allocators @{ + +/** + * \def ralloc_array(ctx, type, count) + * Allocate an array of objects chained off the given context. + * + * Similar to \c calloc, but does not initialize the memory to zero. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \c sizeof(type) and \p count. This is necessary for security. + * + * This is equivalent to: + * \code + * ((type *) ralloc_array_size(ctx, sizeof(type), count) + * \endcode + */ +#define ralloc_array(ctx, type, count) \ + ((type *) ralloc_array_size(ctx, sizeof(type), count)) + +/** + * \def rzalloc_array(ctx, type, count) + * Allocate a zero-initialized array chained off the given context. + * + * Similar to \c calloc. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \c sizeof(type) and \p count. This is necessary for security. + * + * This is equivalent to: + * \code + * ((type *) rzalloc_array_size(ctx, sizeof(type), count) + * \endcode + */ +#define rzalloc_array(ctx, type, count) \ + ((type *) rzalloc_array_size(ctx, sizeof(type), count)) + +/** + * \def reralloc(ctx, ptr, type, count) + * Resize a ralloc-managed array, preserving data. + * + * Similar to \c realloc. Unlike C89, passing 0 for \p size does not free the + * memory. Instead, it resizes it to a 0-byte ralloc context, just like + * calling ralloc_size(ctx, 0). This is different from talloc. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \c sizeof(type) and \p count. This is necessary for security. + * + * \param ctx The context to use for new allocation. If \p ptr != NULL, + * it must be the same as ralloc_parent(\p ptr). + * \param ptr Pointer to the array to be resized. May be NULL. + * \param type The element type. + * \param count The number of elements to allocate. + */ +#define reralloc(ctx, ptr, type, count) \ + ((type *) reralloc_array_size(ctx, ptr, sizeof(type), count)) + +/** + * \def rerzalloc(ctx, ptr, type, count) + * Resize a ralloc-managed array, preserving data and initializing any newly + * allocated data to zero. + * + * Similar to \c realloc. Unlike C89, passing 0 for \p size does not free the + * memory. Instead, it resizes it to a 0-byte ralloc context, just like + * calling ralloc_size(ctx, 0). This is different from talloc. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \c sizeof(type) and \p count. This is necessary for security. + * + * \param ctx The context to use for new allocation. If \p ptr != NULL, + * it must be the same as ralloc_parent(\p ptr). + * \param ptr Pointer to the array to be resized. 
May be NULL. + * \param type The element type. + * \param old_count The number of elements in the previous allocation. + * \param new_count The number of elements to allocate. + */ +#define rerzalloc(ctx, ptr, type, old_count, new_count) \ + ((type *) rerzalloc_array_size(ctx, ptr, sizeof(type), old_count, new_count)) + +/** + * Allocate memory for an array chained off the given context. + * + * Similar to \c calloc, but does not initialize the memory to zero. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \p size and \p count. This is necessary for security. + */ +void *ralloc_array_size(const void *ctx, size_t size, unsigned count) MALLOCLIKE; + +/** + * Allocate a zero-initialized array chained off the given context. + * + * Similar to \c calloc. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \p size and \p count. This is necessary for security. + */ +void *rzalloc_array_size(const void *ctx, size_t size, unsigned count) MALLOCLIKE; + +/** + * Resize a ralloc-managed array, preserving data. + * + * Similar to \c realloc. Unlike C89, passing 0 for \p size does not free the + * memory. Instead, it resizes it to a 0-byte ralloc context, just like + * calling ralloc_size(ctx, 0). This is different from talloc. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \c sizeof(type) and \p count. This is necessary for security. + * + * \param ctx The context to use for new allocation. If \p ptr != NULL, + * it must be the same as ralloc_parent(\p ptr). + * \param ptr Pointer to the array to be resized. May be NULL. + * \param size The size of an individual element. + * \param count The number of elements to allocate. + * + * \return True unless allocation failed. + */ +void *reralloc_array_size(const void *ctx, void *ptr, size_t size, + unsigned count); + +/** + * Resize a ralloc-managed array, preserving data and initializing any newly + * allocated data to zero. + * + * Similar to \c realloc. Unlike C89, passing 0 for \p size does not free the + * memory. Instead, it resizes it to a 0-byte ralloc context, just like + * calling ralloc_size(ctx, 0). This is different from talloc. + * + * More than a convenience function, this also checks for integer overflow when + * multiplying \c sizeof(type) and \p count. This is necessary for security. + * + * \param ctx The context to use for new allocation. If \p ptr != NULL, + * it must be the same as ralloc_parent(\p ptr). + * \param ptr Pointer to the array to be resized. May be NULL. + * \param size The size of an individual element. + * \param old_count The number of elements in the previous allocation. + * \param new_count The number of elements to allocate. + * + * \return True unless allocation failed. + */ +void *rerzalloc_array_size(const void *ctx, void *ptr, size_t size, + unsigned old_count, unsigned new_count); +/// @} + +/** + * Free a piece of ralloc-managed memory. + * + * This will also free the memory of any children allocated this context. + */ +void ralloc_free(void *ptr); + +/** + * "Steal" memory from one context, changing it to another. + * + * This changes \p ptr's context to \p new_ctx. This is quite useful if + * memory is allocated out of a temporary context. + */ +void ralloc_steal(const void *new_ctx, void *ptr); + +/** + * Reparent all children from one context to another. + * + * This effectively calls ralloc_steal(new_ctx, child) for all children of \p old_ctx. 
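+ *
+ * For example (a sketch; \c persistent_ctx stands for any existing context):
+ * \code
+ * void *tmp = ralloc_context(NULL);
+ * char *s = ralloc_strdup(tmp, "hello");
+ * ralloc_adopt(persistent_ctx, tmp);  // s now belongs to persistent_ctx
+ * ralloc_free(tmp);                   // frees only the emptied tmp context
+ * \endcode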
+ */ +void ralloc_adopt(const void *new_ctx, void *old_ctx); + +/** + * Return the given pointer's ralloc context. + */ +void *ralloc_parent(const void *ptr); + +/** + * Set a callback to occur just before an object is freed. + */ +void ralloc_set_destructor(const void *ptr, void(*destructor)(void *)); + +/// \defgroup array String Functions @{ +/** + * Duplicate a string, allocating the memory from the given context. + */ +char *ralloc_strdup(const void *ctx, const char *str) MALLOCLIKE; + +/** + * Duplicate a string, allocating the memory from the given context. + * + * Like \c strndup, at most \p n characters are copied. If \p str is longer + * than \p n characters, \p n are copied, and a termining \c '\0' byte is added. + */ +char *ralloc_strndup(const void *ctx, const char *str, size_t n) MALLOCLIKE; + +/** + * Concatenate two strings, allocating the necessary space. + * + * This appends \p str to \p *dest, similar to \c strcat, using ralloc_resize + * to expand \p *dest to the appropriate size. \p dest will be updated to the + * new pointer unless allocation fails. + * + * The result will always be null-terminated. + * + * \return True unless allocation failed. + */ +bool ralloc_strcat(char **dest, const char *str); + +/** + * Concatenate two strings, allocating the necessary space. + * + * This appends at most \p n bytes of \p str to \p *dest, using ralloc_resize + * to expand \p *dest to the appropriate size. \p dest will be updated to the + * new pointer unless allocation fails. + * + * The result will always be null-terminated; \p str does not need to be null + * terminated if it is longer than \p n. + * + * \return True unless allocation failed. + */ +bool ralloc_strncat(char **dest, const char *str, size_t n); + +/** + * Concatenate two strings, allocating the necessary space. + * + * This appends \p n bytes of \p str to \p *dest, using ralloc_resize + * to expand \p *dest to the appropriate size. \p dest will be updated to the + * new pointer unless allocation fails. + * + * The result will always be null-terminated. + * + * This function differs from ralloc_strcat() and ralloc_strncat() in that it + * does not do any strlen() calls which can become costly on large strings. + * + * \return True unless allocation failed. + */ +bool +ralloc_str_append(char **dest, const char *str, + size_t existing_length, size_t str_size); + +/** + * Print to a string. + * + * This is analogous to \c sprintf, but allocates enough space (using \p ctx + * as the context) for the resulting string. + * + * \return The newly allocated string. + */ +char *ralloc_asprintf (const void *ctx, const char *fmt, ...) PRINTFLIKE(2, 3) MALLOCLIKE; + +/** + * Print to a string, given a va_list. + * + * This is analogous to \c vsprintf, but allocates enough space (using \p ctx + * as the context) for the resulting string. + * + * \return The newly allocated string. + */ +char *ralloc_vasprintf(const void *ctx, const char *fmt, va_list args) MALLOCLIKE; + +/** + * Rewrite the tail of an existing string, starting at a given index. + * + * Overwrites the contents of *str starting at \p start with newly formatted + * text, including a new null-terminator. Allocates more memory as necessary. + * + * This can be used to append formatted text when the length of the existing + * string is already known, saving a strlen() call. + * + * \sa ralloc_asprintf_append + * + * \param str The string to be updated. + * \param start The index to start appending new data at. 
+ * \param fmt    A printf-style formatting string
+ *
+ * \p str will be updated to the new pointer unless allocation fails.
+ * \p start will be increased by the length of the newly formatted text.
+ *
+ * \return True unless allocation failed.
+ */
+bool ralloc_asprintf_rewrite_tail(char **str, size_t *start,
+                                  const char *fmt, ...)
+                                  PRINTFLIKE(3, 4);
+
+/**
+ * Rewrite the tail of an existing string, starting at a given index.
+ *
+ * Overwrites the contents of *str starting at \p start with newly formatted
+ * text, including a new null-terminator.  Allocates more memory as necessary.
+ *
+ * This can be used to append formatted text when the length of the existing
+ * string is already known, saving a strlen() call.
+ *
+ * \sa ralloc_vasprintf_append
+ *
+ * \param str    The string to be updated.
+ * \param start  The index to start appending new data at.
+ * \param fmt    A printf-style formatting string
+ * \param args   A va_list containing the data to be formatted
+ *
+ * \p str will be updated to the new pointer unless allocation fails.
+ * \p start will be increased by the length of the newly formatted text.
+ *
+ * \return True unless allocation failed.
+ */
+bool ralloc_vasprintf_rewrite_tail(char **str, size_t *start, const char *fmt,
+                                   va_list args);
+
+/**
+ * Append formatted text to the supplied string.
+ *
+ * This is equivalent to
+ * \code
+ * ralloc_asprintf_rewrite_tail(str, strlen(*str), fmt, ...)
+ * \endcode
+ *
+ * \sa ralloc_asprintf
+ * \sa ralloc_asprintf_rewrite_tail
+ * \sa ralloc_strcat
+ *
+ * \p str will be updated to the new pointer unless allocation fails.
+ *
+ * \return True unless allocation failed.
+ */
+bool ralloc_asprintf_append (char **str, const char *fmt, ...)
+                             PRINTFLIKE(2, 3);
+
+/**
+ * Append formatted text to the supplied string, given a va_list.
+ *
+ * This is equivalent to
+ * \code
+ * ralloc_vasprintf_rewrite_tail(str, strlen(*str), fmt, args)
+ * \endcode
+ *
+ * \sa ralloc_vasprintf
+ * \sa ralloc_vasprintf_rewrite_tail
+ * \sa ralloc_strcat
+ *
+ * \p str will be updated to the new pointer unless allocation fails.
+ *
+ * \return True unless allocation failed.
+ */
+bool ralloc_vasprintf_append(char **str, const char *fmt, va_list args);
+/// @}
+
+/**
+ * Declare C++ new and delete operators which use ralloc.
+ *
+ * Placing this macro in the body of a class makes it possible to do:
+ *
+ * TYPE *var = new(mem_ctx) TYPE(...);
+ * delete var;
+ *
+ * which is more idiomatic in C++ than calling ralloc.
+ */
+#define DECLARE_ALLOC_CXX_OPERATORS_TEMPLATE(TYPE, ALLOC_FUNC)          \
+private:                                                                \
+   static void _ralloc_destructor(void *p)                              \
+   {                                                                    \
+      reinterpret_cast<TYPE *>(p)->TYPE::~TYPE();                       \
+   }                                                                    \
+public:                                                                 \
+   static void* operator new(size_t size, void *mem_ctx)                \
+   {                                                                    \
+      void *p = ALLOC_FUNC(mem_ctx, size);                              \
+      assert(p != NULL);                                                \
+      if (!HAS_TRIVIAL_DESTRUCTOR(TYPE))                                \
+         ralloc_set_destructor(p, _ralloc_destructor);                  \
+      return p;                                                         \
+   }                                                                    \
+                                                                        \
+   static void operator delete(void *p)                                 \
+   {                                                                    \
+      /* The object's destructor is guaranteed to have already been     \
+       * called by the delete operator at this point -- Make sure it's  \
+       * not called again.
\ + */ \ + if (!HAS_TRIVIAL_DESTRUCTOR(TYPE)) \ + ralloc_set_destructor(p, NULL); \ + ralloc_free(p); \ + } + +#define DECLARE_RALLOC_CXX_OPERATORS(type) \ + DECLARE_ALLOC_CXX_OPERATORS_TEMPLATE(type, ralloc_size) + +#define DECLARE_RZALLOC_CXX_OPERATORS(type) \ + DECLARE_ALLOC_CXX_OPERATORS_TEMPLATE(type, rzalloc_size) + +#define DECLARE_LINEAR_ALLOC_CXX_OPERATORS(type) \ + DECLARE_ALLOC_CXX_OPERATORS_TEMPLATE(type, linear_alloc_child) + +#define DECLARE_LINEAR_ZALLOC_CXX_OPERATORS(type) \ + DECLARE_ALLOC_CXX_OPERATORS_TEMPLATE(type, linear_zalloc_child) + + +/** + * Do a fast allocation from the linear buffer, also known as the child node + * from the allocator's point of view. It can't be freed directly. You have + * to free the parent or the ralloc parent. + * + * \param parent parent node of the linear allocator + * \param size size to allocate (max 32 bits) + */ +void *linear_alloc_child(void *parent, unsigned size); + +/** + * Allocate a parent node that will hold linear buffers. The returned + * allocation is actually the first child node, but it's also the handle + * of the parent node. Use it for all child node allocations. + * + * \param ralloc_ctx ralloc context, must not be NULL + * \param size size to allocate (max 32 bits) + */ +void *linear_alloc_parent(void *ralloc_ctx, unsigned size); + +/** + * Same as linear_alloc_child, but also clears memory. + */ +void *linear_zalloc_child(void *parent, unsigned size); + +/** + * Same as linear_alloc_parent, but also clears memory. + */ +void *linear_zalloc_parent(void *ralloc_ctx, unsigned size); + +/** + * Free the linear parent node. This will free all child nodes too. + * Freeing the ralloc parent will also free this. + */ +void linear_free_parent(void *ptr); + +/** + * Same as ralloc_steal, but steals the linear parent node. + */ +void ralloc_steal_linear_parent(void *new_ralloc_ctx, void *ptr); + +/** + * Return the ralloc parent of the linear parent node. + */ +void *ralloc_parent_of_linear_parent(void *ptr); + +/** + * Same as realloc except that the linear allocator doesn't free child nodes, + * so it's reduced to memory duplication. It's used in places where + * reallocation is required. Don't use it often. It's much slower than + * realloc. + */ +void *linear_realloc(void *parent, void *old, unsigned new_size); + +/* The functions below have the same semantics as their ralloc counterparts, + * except that they always allocate a linear child node. 
+ */
+char *linear_strdup(void *parent, const char *str);
+char *linear_asprintf(void *parent, const char *fmt, ...);
+char *linear_vasprintf(void *parent, const char *fmt, va_list args);
+bool linear_asprintf_append(void *parent, char **str, const char *fmt, ...);
+bool linear_vasprintf_append(void *parent, char **str, const char *fmt,
+                             va_list args);
+bool linear_asprintf_rewrite_tail(void *parent, char **str, size_t *start,
+                                  const char *fmt, ...);
+bool linear_vasprintf_rewrite_tail(void *parent, char **str, size_t *start,
+                                   const char *fmt, va_list args);
+bool linear_strcat(void *parent, char **dest, const char *str);
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif
diff --git a/src/mesa/util/simple_mtx.h b/src/mesa/util/simple_mtx.h
new file mode 100644
index 00000000..2be576ec
--- /dev/null
+++ b/src/mesa/util/simple_mtx.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright © 2015 Intel
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _SIMPLE_MTX_H
+#define _SIMPLE_MTX_H
+
+#include "../util/futex.h"
+
+#include "../c11/threads.h"
+
+#if defined(__GNUC__) && defined(HAVE_LINUX_FUTEX_H)
+
+/* simple_mtx_t - Fast, simple mutex
+ *
+ * While modern pthread mutexes are very fast (implemented using futex), they
+ * still incur a call to an external DSO and the overhead of the generality
+ * and features of pthread mutexes.  Most mutexes in mesa only need
+ * lock/unlock, and the idea here is that we can inline the atomic operation
+ * and make the fast case just two instructions.  Mutexes are subtle and
+ * finicky to implement, so we carefully copy the implementation from Ulrich
+ * Drepper's well-written and well-reviewed paper:
+ *
+ *   "Futexes Are Tricky"
+ *   http://www.akkadia.org/drepper/futex.pdf
+ *
+ * We implement "mutex3", which gives us a mutex that has no syscalls on
+ * uncontended lock or unlock.  Further, the uncontended case boils down to a
+ * locked cmpxchg and an untaken branch, and the uncontended unlock is just a
+ * locked decr and an untaken branch.  We use __builtin_expect() to indicate
+ * that contention is unlikely so that gcc will put the contention code out
+ * of the main code flow.
+ *
+ * A fast mutex only supports lock/unlock, can't be recursive or used with
+ * condition variables.
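+ *
+ * The val field encodes three states: 0 = unlocked, 1 = locked with no
+ * waiters, and 2 = locked with possible waiters.  Only transitions that
+ * involve state 2 ever reach the futex_wait()/futex_wake() calls below.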
+ */ + +typedef struct { + uint32_t val; +} simple_mtx_t; + +#define _SIMPLE_MTX_INITIALIZER_NP { 0 } + +#define _SIMPLE_MTX_INVALID_VALUE 0xd0d0d0d0 + +static inline void +simple_mtx_init(simple_mtx_t *mtx, ASSERTED int type) +{ + assert(type == mtx_plain); + + mtx->val = 0; +} + +static inline void +simple_mtx_destroy(ASSERTED simple_mtx_t *mtx) +{ +#ifndef NDEBUG + mtx->val = _SIMPLE_MTX_INVALID_VALUE; +#endif +} + +static inline void +simple_mtx_lock(simple_mtx_t *mtx) +{ + uint32_t c; + + c = __sync_val_compare_and_swap(&mtx->val, 0, 1); + + assert(c != _SIMPLE_MTX_INVALID_VALUE); + + if (__builtin_expect(c != 0, 0)) { + if (c != 2) + c = __sync_lock_test_and_set(&mtx->val, 2); + while (c != 0) { + futex_wait(&mtx->val, 2, NULL); + c = __sync_lock_test_and_set(&mtx->val, 2); + } + } +} + +static inline void +simple_mtx_unlock(simple_mtx_t *mtx) +{ + uint32_t c; + + c = __sync_fetch_and_sub(&mtx->val, 1); + + assert(c != _SIMPLE_MTX_INVALID_VALUE); + + if (__builtin_expect(c != 1, 0)) { + mtx->val = 0; + futex_wake(&mtx->val, 1); + } +} + +#else + +typedef mtx_t simple_mtx_t; + +#define _SIMPLE_MTX_INITIALIZER_NP _MTX_INITIALIZER_NP + +static inline void +simple_mtx_init(simple_mtx_t *mtx, int type) +{ + mtx_init(mtx, type); +} + +static inline void +simple_mtx_destroy(simple_mtx_t *mtx) +{ + mtx_destroy(mtx); +} + +static inline void +simple_mtx_lock(simple_mtx_t *mtx) +{ + mtx_lock(mtx); +} + +static inline void +simple_mtx_unlock(simple_mtx_t *mtx) +{ + mtx_unlock(mtx); +} + +#endif + +#endif diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 00000000..d5bb28ba --- /dev/null +++ b/src/meson.build @@ -0,0 +1,83 @@ +# Copyright © 2019 Intel Corporation + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
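+
+# Build overview: each GLSL shader below is compiled to a SPIR-V header with
+# glslangValidator; the generated headers are then built, together with the
+# Mesa utility sources and ImGui, into the MangoHud shared library, and the
+# Vulkan implicit-layer JSON manifest and helper scripts are installed too.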
+ +glslang = find_program('glslangValidator') + +overlay_shaders = [ + 'overlay.frag', + 'overlay.vert', +] +overlay_spv = [] +foreach s : ['overlay.frag', 'overlay.vert'] + overlay_spv += custom_target( + s + '.spv.h', input : s, output : s + '.spv.h', + command : [glslang, '-V', '-x', '-o', '@OUTPUT@', '@INPUT@']) +endforeach + +vklayer_files = files( + 'overlay.cpp', + 'overlay_params.c', + # 'font_unispace.c', +) + +vklayer_mesa_overlay = shared_library( + 'MangoHud', + util_files, + vk_enum_to_str, + vklayer_files, + overlay_spv, + vk_layer_table_helpers, + c_args : [ + pre_args, + c_vis_args, + no_override_init_args, + vulkan_wsi_args + ], + cpp_args : [ + pre_args, + cpp_vis_args, + vulkan_wsi_args + ], + dependencies : [ + vulkan_wsi_deps, + libimgui_core_dep, + dep_dl, + dep_pthread], + include_directories : inc_common, + link_args : cc.get_supported_link_arguments(['-Wl,-Bsymbolic-functions', '-Wl,-z,relro']), + install : true +) + +install_data( + files('mangohud.json'), + install_dir : join_paths(get_option('datadir'), 'vulkan', 'implicit_layer.d'), +) + +install_data( + files('setup_mangohud.sh'), + install_dir: get_option('bindir'), +) + +configure_file( + input : files('mesa-overlay-control.py'), + output : '@PLAINNAME@', + configuration : configuration_data(), # only copy the file + install_dir: get_option('bindir'), +) diff --git a/src/overlay.cpp b/src/overlay.cpp new file mode 100644 index 00000000..6b24ec63 --- /dev/null +++ b/src/overlay.cpp @@ -0,0 +1,2837 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include <vulkan/vulkan.h>
+#include <vulkan/vk_layer.h>
+
+// #include "git_sha1.h"
+
+#include "imgui.h"
+
+#include "overlay_params.h"
+
+// #include "util/debug.h"
+#include "mesa/util/hash_table.h"
+#include "mesa/util/list.h"
+#include "mesa/util/ralloc.h"
+#include "mesa/util/os_time.h"
+#include "mesa/util/os_socket.h"
+#include "mesa/util/simple_mtx.h"
+
+#include "vk_enum_to_str.h"
+#include "../include/vulkan/vk_util.h"
+
+#include "cpu_gpu.h"
+#include "logging.h"
+#include "keybinds.h"
+
+bool open = false, displayHud = true;
+string gpuString;
+float offset_x, offset_y, hudSpacing;
+int hudFirstRow, hudSecondRow;
+const char* offset_x_env = std::getenv("X_OFFSET");
+const char* offset_y_env = std::getenv("Y_OFFSET");
+string engineName, engineVersion;
+ImFont* font1;
+
+/* Mapped from VkInstance/VkPhysicalDevice */
+struct instance_data {
+   struct vk_instance_dispatch_table vtable;
+   VkInstance instance;
+
+   struct overlay_params params;
+   bool pipeline_statistics_enabled;
+
+   bool first_line_printed;
+
+   int control_client;
+
+   /* Dumping of frame stats to a file has been enabled. */
+   bool capture_enabled;
+
+   /* Dumping of frame stats to a file has been enabled and started. */
+   bool capture_started;
+};
+
+struct frame_stat {
+   uint64_t stats[OVERLAY_PARAM_ENABLED_MAX];
+};
+
+/* Mapped from VkDevice */
+struct queue_data;
+struct device_data {
+   struct instance_data *instance;
+
+   PFN_vkSetDeviceLoaderData set_device_loader_data;
+
+   struct vk_device_dispatch_table vtable;
+   VkPhysicalDevice physical_device;
+   VkDevice device;
+
+   VkPhysicalDeviceProperties properties;
+
+   struct queue_data *graphic_queue;
+
+   struct queue_data **queues;
+   uint32_t n_queues;
+
+   /* For a single frame */
+   struct frame_stat frame_stats;
+};
+
+/* Mapped from VkCommandBuffer */
+struct command_buffer_data {
+   struct device_data *device;
+
+   VkCommandBufferLevel level;
+
+   VkCommandBuffer cmd_buffer;
+   VkQueryPool pipeline_query_pool;
+   VkQueryPool timestamp_query_pool;
+   uint32_t query_index;
+
+   struct frame_stat stats;
+
+   struct list_head link; /* link into queue_data::running_command_buffer */
+};
+
+/* Mapped from VkQueue */
+struct queue_data {
+   struct device_data *device;
+
+   VkQueue queue;
+   VkQueueFlags flags;
+   uint32_t family_index;
+   uint64_t timestamp_mask;
+
+   VkFence queries_fence;
+
+   struct list_head running_command_buffer;
+};
+
+struct overlay_draw {
+   struct list_head link;
+
+   VkCommandBuffer command_buffer;
+
+   VkSemaphore semaphore;
+   VkFence fence;
+
+   VkBuffer vertex_buffer;
+   VkDeviceMemory vertex_buffer_mem;
+   VkDeviceSize vertex_buffer_size;
+
+   VkBuffer index_buffer;
+   VkDeviceMemory index_buffer_mem;
+   VkDeviceSize index_buffer_size;
+};
+
+/* Mapped from VkSwapchainKHR */
+struct swapchain_data {
+   struct device_data *device;
+
+   VkSwapchainKHR swapchain;
+   unsigned width, height;
+   VkFormat format;
+
+   uint32_t n_images;
+   VkImage *images;
+   VkImageView *image_views;
+   VkFramebuffer *framebuffers;
+
+   VkRenderPass render_pass;
+
+   VkDescriptorPool descriptor_pool;
+   VkDescriptorSetLayout descriptor_layout;
+   VkDescriptorSet descriptor_set;
+
+   VkSampler font_sampler;
+
+   VkPipelineLayout pipeline_layout;
+   VkPipeline pipeline;
+
+   VkCommandPool command_pool;
+
+   struct list_head draws; /* List of struct overlay_draw */
+
+   bool font_uploaded;
+   VkImage font_image;
+   VkImageView font_image_view;
+   VkDeviceMemory font_mem;
+   VkBuffer upload_font_buffer;
+   VkDeviceMemory upload_font_buffer_mem;
+
+   /**/
+   ImGuiContext* imgui_context;
+   ImVec2 window_size;
+
+
/**/ + uint64_t n_frames; + uint64_t last_present_time; + + unsigned n_frames_since_update; + uint64_t last_fps_update; + double fps; + double frametime; + double frametimeDisplay; + const char* cpuString; + const char* gpuString; + + enum overlay_param_enabled stat_selector; + double time_dividor; + struct frame_stat stats_min, stats_max; + struct frame_stat frames_stats[200]; + + /* Over a single frame */ + struct frame_stat frame_stats; + + /* Over fps_sampling_period */ + struct frame_stat accumulated_stats; +}; + +static const VkQueryPipelineStatisticFlags overlay_query_flags = + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT | + VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT | + VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT | + VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT | + VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT | + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT | + VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT | + VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT | + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT | + VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT | + VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT; +#define OVERLAY_QUERY_COUNT (11) + +static struct hash_table_u64 *vk_object_to_data = NULL; +static simple_mtx_t vk_object_to_data_mutex = _SIMPLE_MTX_INITIALIZER_NP; + +thread_local ImGuiContext* __MesaImGui; + +static inline void ensure_vk_object_map(void) +{ + if (!vk_object_to_data) + vk_object_to_data = _mesa_hash_table_u64_create(NULL); +} + +#define HKEY(obj) ((uint64_t)(obj)) +#define FIND(type, obj) ((type *)find_object_data(HKEY(obj))) + +static void *find_object_data(uint64_t obj) +{ + simple_mtx_lock(&vk_object_to_data_mutex); + ensure_vk_object_map(); + void *data = _mesa_hash_table_u64_search(vk_object_to_data, obj); + simple_mtx_unlock(&vk_object_to_data_mutex); + return data; +} + +static void map_object(uint64_t obj, void *data) +{ + simple_mtx_lock(&vk_object_to_data_mutex); + ensure_vk_object_map(); + _mesa_hash_table_u64_insert(vk_object_to_data, obj, data); + simple_mtx_unlock(&vk_object_to_data_mutex); +} + +static void unmap_object(uint64_t obj) +{ + simple_mtx_lock(&vk_object_to_data_mutex); + _mesa_hash_table_u64_remove(vk_object_to_data, obj); + simple_mtx_unlock(&vk_object_to_data_mutex); +} + +/**/ + +#define VK_CHECK(expr) \ + do { \ + VkResult __result = (expr); \ + if (__result != VK_SUCCESS) { \ + fprintf(stderr, "'%s' line %i failed with %s\n", \ + #expr, __LINE__, vk_Result_to_str(__result)); \ + } \ + } while (0) + +/**/ + +static VkLayerInstanceCreateInfo *get_instance_chain_info(const VkInstanceCreateInfo *pCreateInfo, + VkLayerFunction func) +{ + vk_foreach_struct(item, pCreateInfo->pNext) { + if (item->sType == VK_STRUCTURE_TYPE_LOADER_INSTANCE_CREATE_INFO && + ((VkLayerInstanceCreateInfo *) item)->function == func) + return (VkLayerInstanceCreateInfo *) item; + } + unreachable("instance chain info not found"); + return NULL; +} + +static VkLayerDeviceCreateInfo *get_device_chain_info(const VkDeviceCreateInfo *pCreateInfo, + VkLayerFunction func) +{ + vk_foreach_struct(item, pCreateInfo->pNext) { + if (item->sType == VK_STRUCTURE_TYPE_LOADER_DEVICE_CREATE_INFO && + ((VkLayerDeviceCreateInfo *) item)->function == func) + return (VkLayerDeviceCreateInfo *)item; + } + unreachable("device chain info not found"); + return NULL; +} + +static struct VkBaseOutStructure * +clone_chain(const 
struct VkBaseInStructure *chain)
+{
+   struct VkBaseOutStructure *head = NULL, *tail = NULL;
+
+   vk_foreach_struct_const(item, chain) {
+      size_t item_size = vk_structure_type_size(item);
+      struct VkBaseOutStructure *new_item =
+         (struct VkBaseOutStructure *)malloc(item_size);
+
+      memcpy(new_item, item, item_size);
+
+      if (!head)
+         head = new_item;
+      if (tail)
+         tail->pNext = new_item;
+      tail = new_item;
+   }
+
+   return head;
+}
+
+static void
+free_chain(struct VkBaseOutStructure *chain)
+{
+   while (chain) {
+      void *node = chain;
+      chain = chain->pNext;
+      free(node);
+   }
+}
+
+/**/
+
+static struct instance_data *new_instance_data(VkInstance instance)
+{
+   struct instance_data *data = rzalloc(NULL, struct instance_data);
+   data->instance = instance;
+   data->control_client = -1;
+   map_object(HKEY(data->instance), data);
+   return data;
+}
+
+static void destroy_instance_data(struct instance_data *data)
+{
+   if (data->params.output_file)
+      fclose(data->params.output_file);
+   if (data->params.control >= 0)
+      os_socket_close(data->params.control);
+   unmap_object(HKEY(data->instance));
+   ralloc_free(data);
+}
+
+static void instance_data_map_physical_devices(struct instance_data *instance_data,
+                                               bool map)
+{
+   uint32_t physicalDeviceCount = 0;
+   instance_data->vtable.EnumeratePhysicalDevices(instance_data->instance,
+                                                  &physicalDeviceCount,
+                                                  NULL);
+
+   VkPhysicalDevice *physicalDevices = (VkPhysicalDevice *) malloc(sizeof(VkPhysicalDevice) * physicalDeviceCount);
+   instance_data->vtable.EnumeratePhysicalDevices(instance_data->instance,
+                                                  &physicalDeviceCount,
+                                                  physicalDevices);
+
+   for (uint32_t i = 0; i < physicalDeviceCount; i++) {
+      if (map)
+         map_object(HKEY(physicalDevices[i]), instance_data);
+      else
+         unmap_object(HKEY(physicalDevices[i]));
+   }
+
+   free(physicalDevices);
+}
+
+/**/
+static struct device_data *new_device_data(VkDevice device, struct instance_data *instance)
+{
+   struct device_data *data = rzalloc(NULL, struct device_data);
+   data->instance = instance;
+   data->device = device;
+   map_object(HKEY(data->device), data);
+   return data;
+}
+
+static struct queue_data *new_queue_data(VkQueue queue,
+                                         const VkQueueFamilyProperties *family_props,
+                                         uint32_t family_index,
+                                         struct device_data *device_data)
+{
+   struct queue_data *data = rzalloc(device_data, struct queue_data);
+   data->device = device_data;
+   data->queue = queue;
+   data->flags = family_props->queueFlags;
+   data->timestamp_mask = (1ull << family_props->timestampValidBits) - 1;
+   data->family_index = family_index;
+   list_inithead(&data->running_command_buffer);
+   map_object(HKEY(data->queue), data);
+
+   /* Fence synchronizing access to queries on that queue.
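+    * It is created in the signaled state (VK_FENCE_CREATE_SIGNALED_BIT
+    * below), so the first wait on it completes immediately.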
*/ + VkFenceCreateInfo fence_info = {}; + fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fence_info.flags = VK_FENCE_CREATE_SIGNALED_BIT; + VK_CHECK(device_data->vtable.CreateFence(device_data->device, + &fence_info, + NULL, + &data->queries_fence)); + + if (data->flags & VK_QUEUE_GRAPHICS_BIT) + device_data->graphic_queue = data; + + return data; +} + +static void destroy_queue(struct queue_data *data) +{ + struct device_data *device_data = data->device; + device_data->vtable.DestroyFence(device_data->device, data->queries_fence, NULL); + unmap_object(HKEY(data->queue)); + ralloc_free(data); +} + +static void device_map_queues(struct device_data *data, + const VkDeviceCreateInfo *pCreateInfo) +{ + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) + data->n_queues += pCreateInfo->pQueueCreateInfos[i].queueCount; + data->queues = ralloc_array(data, struct queue_data *, data->n_queues); + + struct instance_data *instance_data = data->instance; + uint32_t n_family_props; + instance_data->vtable.GetPhysicalDeviceQueueFamilyProperties(data->physical_device, + &n_family_props, + NULL); + VkQueueFamilyProperties *family_props = + (VkQueueFamilyProperties *)malloc(sizeof(VkQueueFamilyProperties) * n_family_props); + instance_data->vtable.GetPhysicalDeviceQueueFamilyProperties(data->physical_device, + &n_family_props, + family_props); + + uint32_t queue_index = 0; + for (uint32_t i = 0; i < pCreateInfo->queueCreateInfoCount; i++) { + for (uint32_t j = 0; j < pCreateInfo->pQueueCreateInfos[i].queueCount; j++) { + VkQueue queue; + data->vtable.GetDeviceQueue(data->device, + pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex, + j, &queue); + + VK_CHECK(data->set_device_loader_data(data->device, queue)); + + data->queues[queue_index++] = + new_queue_data(queue, &family_props[pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex], + pCreateInfo->pQueueCreateInfos[i].queueFamilyIndex, data); + } + } + + free(family_props); +} + +static void device_unmap_queues(struct device_data *data) +{ + for (uint32_t i = 0; i < data->n_queues; i++) + destroy_queue(data->queues[i]); +} + +static void destroy_device_data(struct device_data *data) +{ + unmap_object(HKEY(data->device)); + ralloc_free(data); +} + +/**/ +static struct command_buffer_data *new_command_buffer_data(VkCommandBuffer cmd_buffer, + VkCommandBufferLevel level, + VkQueryPool pipeline_query_pool, + VkQueryPool timestamp_query_pool, + uint32_t query_index, + struct device_data *device_data) +{ + struct command_buffer_data *data = rzalloc(NULL, struct command_buffer_data); + data->device = device_data; + data->cmd_buffer = cmd_buffer; + data->level = level; + data->pipeline_query_pool = pipeline_query_pool; + data->timestamp_query_pool = timestamp_query_pool; + data->query_index = query_index; + list_inithead(&data->link); + map_object(HKEY(data->cmd_buffer), data); + return data; +} + +static void destroy_command_buffer_data(struct command_buffer_data *data) +{ + unmap_object(HKEY(data->cmd_buffer)); + list_delinit(&data->link); + ralloc_free(data); +} + +/**/ +static struct swapchain_data *new_swapchain_data(VkSwapchainKHR swapchain, + struct device_data *device_data) +{ + struct instance_data *instance_data = device_data->instance; + struct swapchain_data *data = rzalloc(NULL, struct swapchain_data); + data->device = device_data; + data->swapchain = swapchain; + data->window_size = ImVec2(instance_data->params.width, instance_data->params.height); + list_inithead(&data->draws); + map_object(HKEY(data->swapchain), data); + 
return data;
+}
+
+static void destroy_swapchain_data(struct swapchain_data *data)
+{
+   unmap_object(HKEY(data->swapchain));
+   ralloc_free(data);
+}
+
+struct overlay_draw *get_overlay_draw(struct swapchain_data *data)
+{
+   struct device_data *device_data = data->device;
+   struct overlay_draw *draw = list_is_empty(&data->draws) ?
+      NULL : list_first_entry(&data->draws, struct overlay_draw, link);
+
+   VkSemaphoreCreateInfo sem_info = {};
+   sem_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+
+   if (draw && device_data->vtable.GetFenceStatus(device_data->device, draw->fence) == VK_SUCCESS) {
+      list_del(&draw->link);
+      VK_CHECK(device_data->vtable.ResetFences(device_data->device,
+                                               1, &draw->fence));
+      list_addtail(&draw->link, &data->draws);
+      return draw;
+   }
+
+   draw = rzalloc(data, struct overlay_draw);
+
+   VkCommandBufferAllocateInfo cmd_buffer_info = {};
+   cmd_buffer_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+   cmd_buffer_info.commandPool = data->command_pool;
+   cmd_buffer_info.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+   cmd_buffer_info.commandBufferCount = 1;
+   VK_CHECK(device_data->vtable.AllocateCommandBuffers(device_data->device,
+                                                       &cmd_buffer_info,
+                                                       &draw->command_buffer));
+   VK_CHECK(device_data->set_device_loader_data(device_data->device,
+                                                draw->command_buffer));
+
+   VkFenceCreateInfo fence_info = {};
+   fence_info.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
+   VK_CHECK(device_data->vtable.CreateFence(device_data->device,
+                                            &fence_info,
+                                            NULL,
+                                            &draw->fence));
+
+   VK_CHECK(device_data->vtable.CreateSemaphore(device_data->device, &sem_info,
+                                                NULL, &draw->semaphore));
+
+   list_addtail(&draw->link, &data->draws);
+
+   return draw;
+}
+
+static const char *param_unit(enum overlay_param_enabled param)
+{
+   switch (param) {
+   case OVERLAY_PARAM_ENABLED_frame_timing:
+   case OVERLAY_PARAM_ENABLED_acquire_timing:
+   case OVERLAY_PARAM_ENABLED_present_timing:
+      return "(us)";
+   case OVERLAY_PARAM_ENABLED_gpu_timing:
+      return "(ns)";
+   default:
+      return "";
+   }
+}
+
+static void parse_command(struct instance_data *instance_data,
+                          const char *cmd, unsigned cmdlen,
+                          const char *param, unsigned paramlen)
+{
+   if (!strncmp(cmd, "capture", cmdlen)) {
+      int value = atoi(param);
+      bool enabled = value > 0;
+
+      if (enabled) {
+         instance_data->capture_enabled = true;
+      } else {
+         instance_data->capture_enabled = false;
+         instance_data->capture_started = false;
+      }
+   }
+}
+
+#define BUFSIZE 4096
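Editorial aside: the doc comment and parser that follow define a small text protocol spoken over the control socket. As an illustration, here is a minimal hypothetical client; the socket path and the use of a Unix-domain stream socket are assumptions (the layer listens on whatever socket its `control` parameter was configured with), but the `:cmd=param;` framing matches `process_char()` and `control_send()` below:

```cpp
// Hypothetical control client; only the message framing is taken from the
// layer code, the socket path is an assumption for illustration.
#include <cstdio>
#include <cstring>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main()
{
   const char *path = "/tmp/mangohud.ctl"; /* assumed path, not from the patch */
   int fd = socket(AF_UNIX, SOCK_STREAM, 0);
   if (fd < 0) { perror("socket"); return 1; }

   sockaddr_un addr = {};
   addr.sun_family = AF_UNIX;
   strncpy(addr.sun_path, path, sizeof(addr.sun_path) - 1);
   if (connect(fd, (sockaddr *)&addr, sizeof(addr)) < 0) {
      perror("connect");
      return 1;
   }

   /* ':' starts a command, '=' introduces the optional parameter and ';'
    * terminates the message, exactly as process_char() expects. */
   const char msg[] = ":capture=1;";
   write(fd, msg, sizeof(msg) - 1);

   /* Print whatever the layer sends back. */
   char buf[256];
   ssize_t n = read(fd, buf, sizeof(buf) - 1);
   if (n > 0) {
      buf[n] = '\0';
      printf("%s\n", buf);
   }

   close(fd);
   return 0;
}
```

Run against a live layer, the read should return the greeting triple (`MesaOverlayControlVersion`, `DeviceName`, `MesaVersion`) sent by `control_send_connection_string()` further down.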
+
+/**
+ * This function will process commands through the control file.
+ *
+ * A command starts with a colon, followed by the command name, optionally
+ * followed by '=' and a parameter. It has to end with a semi-colon. A full
+ * command + parameter looks like:
+ *
+ * :cmd=param;
+ */
+static void process_char(struct instance_data *instance_data, char c)
+{
+   static char cmd[BUFSIZE];
+   static char param[BUFSIZE];
+
+   static unsigned cmdpos = 0;
+   static unsigned parampos = 0;
+   static bool reading_cmd = false;
+   static bool reading_param = false;
+
+   switch (c) {
+   case ':':
+      cmdpos = 0;
+      parampos = 0;
+      reading_cmd = true;
+      reading_param = false;
+      break;
+   case ';':
+      if (!reading_cmd)
+         break;
+      cmd[cmdpos++] = '\0';
+      param[parampos++] = '\0';
+      parse_command(instance_data, cmd, cmdpos, param, parampos);
+      reading_cmd = false;
+      reading_param = false;
+      break;
+   case '=':
+      if (!reading_cmd)
+         break;
+      reading_param = true;
+      break;
+   default:
+      if (!reading_cmd)
+         break;
+
+      if (reading_param) {
+         /* overflow means an invalid parameter */
+         if (parampos >= BUFSIZE - 1) {
+            reading_cmd = false;
+            reading_param = false;
+            break;
+         }
+
+         param[parampos++] = c;
+      } else {
+         /* overflow means an invalid command */
+         if (cmdpos >= BUFSIZE - 1) {
+            reading_cmd = false;
+            break;
+         }
+
+         cmd[cmdpos++] = c;
+      }
+   }
+}
+
+static void control_send(struct instance_data *instance_data,
+                         const char *cmd, unsigned cmdlen,
+                         const char *param, unsigned paramlen)
+{
+   unsigned msglen = 0;
+   char buffer[BUFSIZE];
+
+   assert(cmdlen + paramlen + 3 < BUFSIZE);
+
+   buffer[msglen++] = ':';
+
+   memcpy(&buffer[msglen], cmd, cmdlen);
+   msglen += cmdlen;
+
+   if (paramlen > 0) {
+      buffer[msglen++] = '=';
+      memcpy(&buffer[msglen], param, paramlen);
+      msglen += paramlen;
+   }
+   /* Every message is terminated by ';', with or without a parameter. */
+   buffer[msglen++] = ';';
+
+   os_socket_send(instance_data->control_client, buffer, msglen, 0);
+}
+
+static void control_send_connection_string(struct device_data *device_data)
+{
+   struct instance_data *instance_data = device_data->instance;
+
+   const char *controlVersionCmd = "MesaOverlayControlVersion";
+   const char *controlVersionString = "1";
+
+   control_send(instance_data, controlVersionCmd, strlen(controlVersionCmd),
+                controlVersionString, strlen(controlVersionString));
+
+   const char *deviceCmd = "DeviceName";
+   const char *deviceName = device_data->properties.deviceName;
+
+   control_send(instance_data, deviceCmd, strlen(deviceCmd),
+                deviceName, strlen(deviceName));
+
+   const char *mesaVersionCmd = "MesaVersion";
+   const char *mesaVersionString = "Mesa " PACKAGE_VERSION;
+
+   control_send(instance_data, mesaVersionCmd, strlen(mesaVersionCmd),
+                mesaVersionString, strlen(mesaVersionString));
+}
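+/* [Editor's note] The layer serves at most one control client at a time:
+ * params.control is the listening socket and control_client the accepted
+ * connection (or -1). The accepted socket is switched to non-blocking mode
+ * so that the per-frame polling in process_control_socket() never stalls
+ * the render loop. */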
+static void control_client_check(struct device_data *device_data)
+{
+   struct instance_data *instance_data = device_data->instance;
+
+   /* Already connected, just return. */
+   if (instance_data->control_client >= 0)
+      return;
+
+   int socket = os_socket_accept(instance_data->params.control);
+   if (socket == -1) {
+      if (errno != EAGAIN && errno != EWOULDBLOCK && errno != ECONNABORTED)
+         fprintf(stderr, "ERROR on socket: %s\n", strerror(errno));
+      return;
+   }
+
+   if (socket >= 0) {
+      os_socket_block(socket, false);
+      instance_data->control_client = socket;
+      control_send_connection_string(device_data);
+   }
+}
+
+static void control_client_disconnected(struct instance_data *instance_data)
+{
+   os_socket_close(instance_data->control_client);
+   instance_data->control_client = -1;
+}
+
+static void process_control_socket(struct instance_data *instance_data)
+{
+   const int client = instance_data->control_client;
+   if (client >= 0) {
+      char buf[BUFSIZE];
+
+      while (true) {
+         ssize_t n = os_socket_recv(client, buf, BUFSIZE, 0);
+
+         if (n == -1) {
+            if (errno == EAGAIN || errno == EWOULDBLOCK) {
+               /* nothing to read, try again later */
+               break;
+            }
+
+            if (errno != ECONNRESET)
+               fprintf(stderr, "ERROR on connection: %s\n", strerror(errno));
+
+            control_client_disconnected(instance_data);
+         } else if (n == 0) {
+            /* recv() returns 0 when the client disconnects */
+            control_client_disconnected(instance_data);
+         }
+
+         for (ssize_t i = 0; i < n; i++) {
+            process_char(instance_data, buf[i]);
+         }
+
+         /* If we try to read BUFSIZE and receive BUFSIZE bytes from the
+          * socket, there's a good chance that there's still more data to be
+          * read, so we will try again. Otherwise, simply be done for this
+          * iteration and try again on the next frame.
+          */
+         if (n < BUFSIZE)
+            break;
+      }
+   }
+}
+
+static void snapshot_swapchain_frame(struct swapchain_data *data)
+{
+   struct device_data *device_data = data->device;
+   struct instance_data *instance_data = device_data->instance;
+   uint32_t f_idx = data->n_frames % ARRAY_SIZE(data->frames_stats);
+   uint64_t now = os_time_get(); /* us */
+
+   if (instance_data->params.control >= 0) {
+      control_client_check(device_data);
+      process_control_socket(instance_data);
+   }
+
+   double elapsed = (double)(now - data->last_fps_update); /* us */
+   elapsedF2 = (double)(now - last_f2_press);
+   elapsedF12 = (double)(now - last_f12_press);
+   fps = 1000000.0f * data->n_frames_since_update / elapsed;
+
+   if (data->last_present_time) {
+      data->frame_stats.stats[OVERLAY_PARAM_ENABLED_frame_timing] =
+         now - data->last_present_time;
+   }
+
+   memset(&data->frames_stats[f_idx], 0, sizeof(data->frames_stats[f_idx]));
+   for (int s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++) {
+      data->frames_stats[f_idx].stats[s] += device_data->frame_stats.stats[s] + data->frame_stats.stats[s];
+      data->accumulated_stats.stats[s] += device_data->frame_stats.stats[s] + data->frame_stats.stats[s];
+   }
+
+   /* Only allow the logging toggle when MANGOHUD_OUTPUT is set, and debounce
+    * the key for half a second. */
+   if (elapsedF2 >= 500000 && mangohud_output_env != NULL){
+      if (key_is_pressed(XK_F2)){
+         last_f2_press = now;
+         log_start = now;
+         loggingOn = !loggingOn;
+
+         if (loggingOn && log_period != 0)
+            pthread_create(&f2, NULL, &logging, NULL);
+      }
+   }
+
+   if (elapsedF12 >= 500000){
+      if (key_is_pressed(XK_F12)){
+         displayHud = !displayHud;
+         last_f12_press = now;
+      }
+   }
+
+   if (!sysInfoFetched) {
+      deviceName = device_data->properties.deviceName;
+      ram = exec("cat /proc/meminfo | grep 'MemTotal' | awk '{print $2}'");
+      cpu = exec("cat /proc/cpuinfo | grep 'model name' | tail -n1 | sed 's/^.*: //' | sed 's/([^)]*)/()/g' | tr -d '(/)'");
+      kernel = exec("uname -r");
+      os = exec("cat /etc/*-release | grep 'PRETTY_NAME' | cut -d '=' -f 2-");
+      os.erase(remove( os.begin(), os.end(), '\"'
),os.end()); + gpu = exec("lspci | grep VGA | head -n1 | awk -vRS=']' -vFS='[' '{print $2}' | sed '/^$/d' | tail -n1"); + driver = exec("glxinfo | grep 'OpenGL version' | sed 's/^.*: //' | cut -d' ' --output-delimiter=$'\n' -f1- | grep -v '(' | grep -v ')' | tr '\n' ' ' | cut -c 1-"); + ram.pop_back(); + cpu.pop_back(); + kernel.pop_back(); + os.pop_back(); + gpu.pop_back(); + driver.pop_back(); + + log_period = (log_period_env) ? std::stoi(log_period_env) : 100; + + if (log_period == 0) + out.open("/tmp/mango", ios::out | ios::app); + + if(duration_env) + duration = std::stoi(duration_env); + + coreCounting(); + if (deviceName.find("Radeon") != std::string::npos || deviceName.find("AMD") != std::string::npos) { + amdGpuFile = fopen("/sys/class/drm/card0/device/gpu_busy_percent", "r"); + string tempFolder = exec("ls /sys/class/drm/card0/device/hwmon/"); + tempFolder.pop_back(); + string tempLocation = "/sys/class/drm/card0/device/hwmon/" + tempFolder + "/temp1_input"; + amdTempFile = fopen(tempLocation.c_str(), "r"); + } + if (cpu.find("Intel") != std::string::npos){ + string cpuTempFolder = exec("ls /sys/devices/platform/coretemp.0/hwmon/"); + cpuTempFolder.pop_back(); + cpuTempLocation = "/sys/devices/platform/coretemp.0/hwmon/" + cpuTempFolder + "/temp1_input"; + cpuTempFile = fopen(cpuTempLocation.c_str(), "r"); + } else { + string name; + string path; + for (size_t i = 0; i < 10; i++) + { + path = "/sys/class/hwmon/hwmon" + to_string(i) + "/name"; + name = exec("cat " + path); + name.pop_back(); + if (name == "k10temp" || name == "zenpower"){ + cpuTempLocation = "/sys/class/hwmon/hwmon" + to_string(i) + "/temp1_input"; + break; + } + } + if (cpuTempLocation.empty()) { + cout << "MANGOHUD: Could not find temp location" << endl; + } else { + cpuTempFile = fopen(cpuTempLocation.c_str(), "r"); + } + } + // Adjust height for DXVK/VKD3D version number + if (engineName == "DXVK" || engineName == "VKD3D"){ + if (instance_data->params.font_size){ + instance_data->params.height += instance_data->params.font_size / 2; + } else { + instance_data->params.height += 24 / 2; + } + } + + sysInfoFetched = true; + } + + /* If capture has been enabled but it hasn't started yet, it means we are on + * the first snapshot after it has been enabled. At this point we want to + * use the stats captured so far to update the display, but we don't want + * this data to cause noise to the stats that we want to capture from now + * on. + * + * capture_begin == true will trigger an update of the fps on display, and a + * flush of the data, but no stats will be written to the output file. This + * way, we will have only stats from after the capture has been enabled + * written to the output_file. 
+ */ + const bool capture_begin = + instance_data->capture_enabled && !instance_data->capture_started; + + if (data->last_fps_update) { + if (capture_begin || + elapsed >= instance_data->params.fps_sampling_period) { + updateCpuStrings(); + pthread_create(&cpuThread, NULL, &getCpuUsage, NULL); + data->cpuString = cpuArray[0].output.c_str(); + pthread_create(&cpuInfoThread, NULL, &cpuInfo, NULL); + + // get gpu usage + if (deviceName.find("GeForce") != std::string::npos) + pthread_create(&nvidiaSmiThread, NULL, &queryNvidiaSmi, NULL); + + if (deviceName.find("Radeon") != std::string::npos || deviceName.find("AMD") != std::string::npos) + pthread_create(&gpuThread, NULL, &getAmdGpuUsage, NULL); + + // update variables for logging + cpuLoadLog = cpuArray[0].value; + gpuLoadLog = gpuLoad; + + data->frametimeDisplay = data->frametime; + data->fps = fps; + if (instance_data->capture_started) { + if (!instance_data->first_line_printed) { + bool first_column = true; + + instance_data->first_line_printed = true; + +#define OVERLAY_PARAM_BOOL(name) \ + if (instance_data->params.enabled[OVERLAY_PARAM_ENABLED_##name]) { \ + fprintf(instance_data->params.output_file, \ + "%s%s%s", first_column ? "" : ", ", #name, \ + param_unit(OVERLAY_PARAM_ENABLED_##name)); \ + first_column = false; \ + } +#define OVERLAY_PARAM_CUSTOM(name) + OVERLAY_PARAMS +#undef OVERLAY_PARAM_BOOL +#undef OVERLAY_PARAM_CUSTOM + fprintf(instance_data->params.output_file, "\n"); + } + + for (int s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++) { + if (!instance_data->params.enabled[s]) + continue; + if (s == OVERLAY_PARAM_ENABLED_fps) { + fprintf(instance_data->params.output_file, + "%s%.2f", s == 0 ? "" : ", ", data->fps); + } else { + fprintf(instance_data->params.output_file, + "%s%" PRIu64, s == 0 ? "" : ", ", + data->accumulated_stats.stats[s]); + } + } + fprintf(instance_data->params.output_file, "\n"); + fflush(instance_data->params.output_file); + } + + memset(&data->accumulated_stats, 0, sizeof(data->accumulated_stats)); + data->n_frames_since_update = 0; + data->last_fps_update = now; + + if (capture_begin) + instance_data->capture_started = true; + } + } else { + data->last_fps_update = now; + } + + memset(&device_data->frame_stats, 0, sizeof(device_data->frame_stats)); + memset(&data->frame_stats, 0, sizeof(device_data->frame_stats)); + + data->last_present_time = now; + data->n_frames++; + data->n_frames_since_update++; +} + +static float get_time_stat(void *_data, int _idx) +{ + struct swapchain_data *data = (struct swapchain_data *) _data; + if ((ARRAY_SIZE(data->frames_stats) - _idx) > data->n_frames) + return 0.0f; + int idx = ARRAY_SIZE(data->frames_stats) + + data->n_frames < ARRAY_SIZE(data->frames_stats) ? + _idx - data->n_frames : + _idx + data->n_frames; + idx %= ARRAY_SIZE(data->frames_stats); + /* Time stats are in us. */ + return data->frames_stats[idx].stats[data->stat_selector] / data->time_dividor; +} + +static float get_stat(void *_data, int _idx) +{ + struct swapchain_data *data = (struct swapchain_data *) _data; + if ((ARRAY_SIZE(data->frames_stats) - _idx) > data->n_frames) + return 0.0f; + int idx = ARRAY_SIZE(data->frames_stats) + + data->n_frames < ARRAY_SIZE(data->frames_stats) ? 
+      _idx - data->n_frames :
+      _idx + data->n_frames;
+   idx %= ARRAY_SIZE(data->frames_stats);
+   return data->frames_stats[idx].stats[data->stat_selector];
+}
+
+static void position_layer(struct swapchain_data *data)
+{
+   struct device_data *device_data = data->device;
+   struct instance_data *instance_data = device_data->instance;
+   float margin = 10.0f;
+   if (offset_x_env != NULL)
+      margin = 0.0f;
+
+   ImGui::SetNextWindowBgAlpha(0.5);
+   ImGui::SetNextWindowSize(ImVec2(instance_data->params.width, instance_data->params.height), ImGuiCond_Always);
+   ImGui::PushStyleVar(ImGuiStyleVar_WindowBorderSize, 0.0f);
+
+   if (offset_x_env != NULL)
+      offset_x = std::stof(offset_x_env);
+
+   if (offset_y_env != NULL)
+      offset_y = std::stof(offset_y_env);
+
+   switch (instance_data->params.position) {
+   case LAYER_POSITION_TOP_LEFT:
+      ImGui::SetNextWindowPos(ImVec2(margin + offset_x, margin + offset_y), ImGuiCond_Always);
+      break;
+   case LAYER_POSITION_TOP_RIGHT:
+      ImGui::SetNextWindowPos(ImVec2(data->width - data->window_size.x - margin, margin),
+                              ImGuiCond_Always);
+      break;
+   case LAYER_POSITION_BOTTOM_LEFT:
+      ImGui::SetNextWindowPos(ImVec2(margin, data->height - data->window_size.y - margin),
+                              ImGuiCond_Always);
+      break;
+   case LAYER_POSITION_BOTTOM_RIGHT:
+      ImGui::SetNextWindowPos(ImVec2(data->width - data->window_size.x - margin,
+                                     data->height - data->window_size.y - margin),
+                              ImGuiCond_Always);
+      break;
+   }
+}
+
+static void compute_swapchain_display(struct swapchain_data *data)
+{
+   struct device_data *device_data = data->device;
+   struct instance_data *instance_data = device_data->instance;
+
+   ImGui::SetCurrentContext(data->imgui_context);
+   ImGui::NewFrame();
+   position_layer(data);
+   if (instance_data->params.font_size > 0 && instance_data->params.width == 280)
+      instance_data->params.width = hudFirstRow + hudSecondRow;
+
+   if (displayHud) {
+      ImGui::Begin("Main", &open, ImGuiWindowFlags_NoDecoration);
+   } else {
+      /* Keep a nearly invisible window around so the context stays alive. */
+      ImGui::SetNextWindowBgAlpha(0.01);
+      ImGui::Begin("Main", &open, ImGuiWindowFlags_NoDecoration);
+   }
+
+   if (displayHud){
+      if (deviceName.find("GeForce") != std::string::npos || deviceName.find("Radeon") != std::string::npos || deviceName.find("AMD") != std::string::npos){
+         ImGui::TextColored(ImVec4(0.0, 0.502, 0.25, 1.00f), "GPU");
+         ImGui::SameLine(hudFirstRow);
+         ImGui::Text("%s%%", gpuLoadDisplay.c_str());
+         if (instance_data->params.enabled[OVERLAY_PARAM_ENABLED_gpu_temp]){
+            ImGui::SameLine(hudSecondRow);
+            ImGui::Text("%i%s", gpuTemp, "°C");
+         }
+      }
+      ImGui::TextColored(ImVec4(0.0, 0.502, 0.753, 1.00f), "CPU");
+      ImGui::SameLine(hudFirstRow);
+      ImGui::Text("%d%%", cpuLoadLog);
+      if (instance_data->params.enabled[OVERLAY_PARAM_ENABLED_cpu_temp]){
+         ImGui::SameLine(hudSecondRow);
+         ImGui::Text("%i%s", cpuTemp, "°C");
+      }
+
+      if (instance_data->params.enabled[OVERLAY_PARAM_ENABLED_core_load]){
+         for (int i = 0; i < numCpuCores; i++)
+         {
+            ImGui::TextColored(ImVec4(0.0, 0.502, 0.753, 1.00f), "CPU");
+            ImGui::SameLine(0, 1.0f);
+            ImGui::PushFont(font1);
+            ImGui::TextColored(ImVec4(0.0, 0.502, 0.753, 1.00f),"%i", i);
+            ImGui::PopFont();
+            ImGui::SameLine(hudFirstRow);
+            ImGui::Text("%i%%", cpuArray[i + 1].value);
+            ImGui::SameLine(hudSecondRow);
+            ImGui::Text("%i", cpuArray[i + 1].freq);
ImGui::SameLine(0, 1.0f); + ImGui::PushFont(font1); + ImGui::Text("MHz"); + ImGui::PopFont(); + } + } + if (instance_data->params.enabled[OVERLAY_PARAM_ENABLED_fps]){ + int fpsLength = to_string(int(data->fps)).length(); + int msLength = to_string(1000 / data->fps).length(); + ImGui::TextColored(ImVec4(0.753, 0.502, 0.502, 1.00f), "%s", engineName.c_str()); + ImGui::SameLine(hudFirstRow); + ImGui::Text("%.0f", data->fps); + ImGui::SameLine(0, 1.0f); + ImGui::PushFont(font1); + ImGui::Text("FPS"); + ImGui::PopFont(); + ImGui::SameLine(hudSecondRow); + ImGui::Text("%.1f", 1000 / data->fps); + ImGui::SameLine(0, 1.0f); + ImGui::PushFont(font1); + ImGui::Text("ms"); + ImGui::PopFont(); + if (engineName == "DXVK" || engineName == "VKD3D"){ + ImGui::PushFont(font1); + ImGui::TextColored(ImVec4(0.753, 0.502, 0.502, 1.00f), "%s", engineVersion.c_str()); + ImGui::PopFont(); + } + } + + // ImGui::ProgressBar(float(0.5), ImVec2(ImGui::GetContentRegionAvailWidth(), 21), NULL); + ImGui::Dummy(ImVec2(0.0f, 20.0f)); + + if (loggingOn && log_period == 0){ + uint64_t now = os_time_get(); + elapsedLog = (double)(now - log_start); + if ((elapsedLog) >= duration * 1000000) + loggingOn = false; + + out << fps << "," << cpuLoadLog << "," << gpuLoadLog << "," << (now - log_start) << endl; + } + + /* Recompute min/max */ + for (uint32_t s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++) { + data->stats_min.stats[s] = UINT64_MAX; + data->stats_max.stats[s] = 0; + } + for (uint32_t f = 0; f < MIN2(data->n_frames, ARRAY_SIZE(data->frames_stats)); f++) { + for (uint32_t s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++) { + data->stats_min.stats[s] = MIN2(data->frames_stats[f].stats[s], + data->stats_min.stats[s]); + data->stats_max.stats[s] = MAX2(data->frames_stats[f].stats[s], + data->stats_max.stats[s]); + } + } + for (uint32_t s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++) { + assert(data->stats_min.stats[s] != UINT64_MAX); + } + + for (uint32_t s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++) { + if (!instance_data->params.enabled[s] || + s == OVERLAY_PARAM_ENABLED_fps || + s == OVERLAY_PARAM_ENABLED_frame) + continue; + + char hash[40]; + snprintf(hash, sizeof(hash), "##%s", overlay_param_names[s]); + data->stat_selector = (enum overlay_param_enabled) s; + data->time_dividor = 1000.0f; + if (s == OVERLAY_PARAM_ENABLED_gpu_timing) + data->time_dividor = 1000000.0f; + + if (s == OVERLAY_PARAM_ENABLED_frame_timing || + s == OVERLAY_PARAM_ENABLED_acquire_timing || + s == OVERLAY_PARAM_ENABLED_present_timing || + s == OVERLAY_PARAM_ENABLED_gpu_timing) { + // double min_time = data->stats_min.stats[s] / data->time_dividor; + // double max_time = data->stats_max.stats[s] / data->time_dividor; + double min_time = 0.0f; + double max_time = 50.0f; + ImGui::PlotLines(hash, get_time_stat, data, + ARRAY_SIZE(data->frames_stats), 0, + NULL, min_time, max_time, + ImVec2(ImGui::GetContentRegionAvailWidth(), 50)); + // ImGui::Text("%s: %.3fms [%.3f, %.3f]", overlay_param_names[s], + // get_time_stat(data, ARRAY_SIZE(data->frames_stats) - 1), + // min_time, max_time); + } else { + ImGui::PlotHistogram(hash, get_stat, data, + ARRAY_SIZE(data->frames_stats), 0, + NULL, + data->stats_min.stats[s], + data->stats_max.stats[s], + ImVec2(ImGui::GetContentRegionAvailWidth(), 50)); + // ImGui::Text("%s: %.0f [%" PRIu64 ", %" PRIu64 "]", overlay_param_names[s], + // get_stat(data, ARRAY_SIZE(data->frames_stats) - 1), + // data->stats_min.stats[s], data->stats_max.stats[s]); + } + } + data->window_size = ImVec2(data->window_size.x, ImGui::GetCursorPosY() + 10.0f); 
+   }
+   ImGui::End();
+   if (loggingOn){
+      ImGui::SetNextWindowBgAlpha(0.01);
+      ImGui::SetNextWindowSize(ImVec2(200, 100), ImGuiCond_Always);
+      ImGui::SetNextWindowPos(ImVec2(data->width - 200, 0),
+                              ImGuiCond_Always);
+      ImGui::Begin("Logging", &open, ImGuiWindowFlags_NoDecoration);
+      ImGui::Text("Logging...");
+      ImGui::Text("Elapsed: %isec", int((elapsedLog) / 1000000));
+      ImGui::End();
+   }
+   ImGui::PopStyleVar();
+   ImGui::EndFrame();
+   ImGui::Render();
+}
+
+static uint32_t vk_memory_type(struct device_data *data,
+                               VkMemoryPropertyFlags properties,
+                               uint32_t type_bits)
+{
+   VkPhysicalDeviceMemoryProperties prop;
+   data->instance->vtable.GetPhysicalDeviceMemoryProperties(data->physical_device, &prop);
+   for (uint32_t i = 0; i < prop.memoryTypeCount; i++)
+      if ((prop.memoryTypes[i].propertyFlags & properties) == properties &&
+          type_bits & (1 << i))
+         return i;
+   return 0xFFFFFFFF; /* Unable to find memoryType */
+}
+
+static void ensure_swapchain_fonts(struct swapchain_data *data,
+                                   VkCommandBuffer command_buffer)
+{
+   if (data->font_uploaded)
+      return;
+
+   data->font_uploaded = true;
+
+   struct device_data *device_data = data->device;
+   ImGuiIO& io = ImGui::GetIO();
+   unsigned char* pixels;
+   int width, height;
+   io.Fonts->GetTexDataAsRGBA32(&pixels, &width, &height);
+   size_t upload_size = width * height * 4 * sizeof(char);
+
+   /* Upload buffer */
+   VkBufferCreateInfo buffer_info = {};
+   buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+   buffer_info.size = upload_size;
+   buffer_info.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT;
+   buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+   VK_CHECK(device_data->vtable.CreateBuffer(device_data->device, &buffer_info,
+                                             NULL, &data->upload_font_buffer));
+   VkMemoryRequirements upload_buffer_req;
+   device_data->vtable.GetBufferMemoryRequirements(device_data->device,
+                                                   data->upload_font_buffer,
+                                                   &upload_buffer_req);
+   VkMemoryAllocateInfo upload_alloc_info = {};
+   upload_alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+   upload_alloc_info.allocationSize = upload_buffer_req.size;
+   upload_alloc_info.memoryTypeIndex = vk_memory_type(device_data,
+                                                      VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+                                                      upload_buffer_req.memoryTypeBits);
+   VK_CHECK(device_data->vtable.AllocateMemory(device_data->device,
+                                               &upload_alloc_info,
+                                               NULL,
+                                               &data->upload_font_buffer_mem));
+   VK_CHECK(device_data->vtable.BindBufferMemory(device_data->device,
+                                                 data->upload_font_buffer,
+                                                 data->upload_font_buffer_mem, 0));
+
+   /* Upload to Buffer */
+   char* map = NULL;
+   VK_CHECK(device_data->vtable.MapMemory(device_data->device,
+                                          data->upload_font_buffer_mem,
+                                          0, upload_size, 0, (void**)(&map)));
+   memcpy(map, pixels, upload_size);
+   VkMappedMemoryRange range[1] = {};
+   range[0].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+   range[0].memory = data->upload_font_buffer_mem;
+   range[0].size = upload_size;
+   VK_CHECK(device_data->vtable.FlushMappedMemoryRanges(device_data->device, 1, range));
+   device_data->vtable.UnmapMemory(device_data->device,
+                                   data->upload_font_buffer_mem);
+
+   /* Copy buffer to image */
+   VkImageMemoryBarrier copy_barrier[1] = {};
+   copy_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+   copy_barrier[0].dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+   copy_barrier[0].oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+   copy_barrier[0].newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+   copy_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+   copy_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+   copy_barrier[0].image = data->font_image;
+   copy_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+   copy_barrier[0].subresourceRange.levelCount = 1;
+   copy_barrier[0].subresourceRange.layerCount = 1;
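+   /* [Editor's note] Font upload sequence: the barrier below transitions the
+    * font image from UNDEFINED to TRANSFER_DST_OPTIMAL (host write to
+    * transfer), the staging buffer is then copied into the image, and a
+    * second barrier moves it to SHADER_READ_ONLY_OPTIMAL so the fragment
+    * shader can sample it. */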
+   device_data->vtable.CmdPipelineBarrier(command_buffer,
+                                          VK_PIPELINE_STAGE_HOST_BIT,
+                                          VK_PIPELINE_STAGE_TRANSFER_BIT,
+                                          0, 0, NULL, 0, NULL,
+                                          1, copy_barrier);
+
+   VkBufferImageCopy region = {};
+   region.imageSubresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+   region.imageSubresource.layerCount = 1;
+   region.imageExtent.width = width;
+   region.imageExtent.height = height;
+   region.imageExtent.depth = 1;
+   device_data->vtable.CmdCopyBufferToImage(command_buffer,
+                                            data->upload_font_buffer,
+                                            data->font_image,
+                                            VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
+                                            1, &region);
+
+   VkImageMemoryBarrier use_barrier[1] = {};
+   use_barrier[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
+   use_barrier[0].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+   use_barrier[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+   use_barrier[0].oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL;
+   use_barrier[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
+   use_barrier[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+   use_barrier[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+   use_barrier[0].image = data->font_image;
+   use_barrier[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
+   use_barrier[0].subresourceRange.levelCount = 1;
+   use_barrier[0].subresourceRange.layerCount = 1;
+   device_data->vtable.CmdPipelineBarrier(command_buffer,
+                                          VK_PIPELINE_STAGE_TRANSFER_BIT,
+                                          VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
+                                          0,
+                                          0, NULL,
+                                          0, NULL,
+                                          1, use_barrier);
+
+   /* Store our identifier */
+   io.Fonts->TexID = (ImTextureID)(intptr_t)data->font_image;
+}
+
+static void CreateOrResizeBuffer(struct device_data *data,
+                                 VkBuffer *buffer,
+                                 VkDeviceMemory *buffer_memory,
+                                 VkDeviceSize *buffer_size,
+                                 size_t new_size, VkBufferUsageFlagBits usage)
+{
+   if (*buffer != VK_NULL_HANDLE)
+      data->vtable.DestroyBuffer(data->device, *buffer, NULL);
+   if (*buffer_memory)
+      data->vtable.FreeMemory(data->device, *buffer_memory, NULL);
+
+   VkBufferCreateInfo buffer_info = {};
+   buffer_info.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+   buffer_info.size = new_size;
+   buffer_info.usage = usage;
+   buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+   VK_CHECK(data->vtable.CreateBuffer(data->device, &buffer_info, NULL, buffer));
+
+   VkMemoryRequirements req;
+   data->vtable.GetBufferMemoryRequirements(data->device, *buffer, &req);
+   VkMemoryAllocateInfo alloc_info = {};
+   alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+   alloc_info.allocationSize = req.size;
+   alloc_info.memoryTypeIndex =
+      vk_memory_type(data, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, req.memoryTypeBits);
+   VK_CHECK(data->vtable.AllocateMemory(data->device, &alloc_info, NULL, buffer_memory));
+
+   VK_CHECK(data->vtable.BindBufferMemory(data->device, *buffer, *buffer_memory, 0));
+   *buffer_size = new_size;
+}
+
+static struct overlay_draw *render_swapchain_display(struct swapchain_data *data,
+                                                     struct queue_data *present_queue,
+                                                     const VkSemaphore *wait_semaphores,
+                                                     unsigned n_wait_semaphores,
+                                                     unsigned image_index)
+{
+   ImDrawData* draw_data = ImGui::GetDrawData();
+   if (draw_data->TotalVtxCount == 0)
+      return NULL;
+
+   struct device_data *device_data = data->device;
+   struct overlay_draw *draw = get_overlay_draw(data);
+
+   device_data->vtable.ResetCommandBuffer(draw->command_buffer, 0);
+
+   VkRenderPassBeginInfo render_pass_info = {};
+   render_pass_info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+   render_pass_info.renderPass = data->render_pass;
+   render_pass_info.framebuffer = data->framebuffers[image_index];
render_pass_info.renderArea.extent.width = data->width; + render_pass_info.renderArea.extent.height = data->height; + + VkCommandBufferBeginInfo buffer_begin_info = {}; + buffer_begin_info.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + + device_data->vtable.BeginCommandBuffer(draw->command_buffer, &buffer_begin_info); + + ensure_swapchain_fonts(data, draw->command_buffer); + + /* Bounce the image to display back to color attachment layout for + * rendering on top of it. + */ + VkImageMemoryBarrier imb; + imb.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + imb.pNext = nullptr; + imb.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + imb.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + imb.oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + imb.newLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + imb.image = data->images[image_index]; + imb.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + imb.subresourceRange.baseMipLevel = 0; + imb.subresourceRange.levelCount = 1; + imb.subresourceRange.baseArrayLayer = 0; + imb.subresourceRange.layerCount = 1; + imb.srcQueueFamilyIndex = present_queue->family_index; + imb.dstQueueFamilyIndex = device_data->graphic_queue->family_index; + device_data->vtable.CmdPipelineBarrier(draw->command_buffer, + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, + 0, /* dependency flags */ + 0, nullptr, /* memory barriers */ + 0, nullptr, /* buffer memory barriers */ + 1, &imb); /* image memory barriers */ + + device_data->vtable.CmdBeginRenderPass(draw->command_buffer, &render_pass_info, + VK_SUBPASS_CONTENTS_INLINE); + + /* Create/Resize vertex & index buffers */ + size_t vertex_size = draw_data->TotalVtxCount * sizeof(ImDrawVert); + size_t index_size = draw_data->TotalIdxCount * sizeof(ImDrawIdx); + if (draw->vertex_buffer_size < vertex_size) { + CreateOrResizeBuffer(device_data, + &draw->vertex_buffer, + &draw->vertex_buffer_mem, + &draw->vertex_buffer_size, + vertex_size, VK_BUFFER_USAGE_VERTEX_BUFFER_BIT); + } + if (draw->index_buffer_size < index_size) { + CreateOrResizeBuffer(device_data, + &draw->index_buffer, + &draw->index_buffer_mem, + &draw->index_buffer_size, + index_size, VK_BUFFER_USAGE_INDEX_BUFFER_BIT); + } + + /* Upload vertex & index data */ + ImDrawVert* vtx_dst = NULL; + ImDrawIdx* idx_dst = NULL; + VK_CHECK(device_data->vtable.MapMemory(device_data->device, draw->vertex_buffer_mem, + 0, vertex_size, 0, (void**)(&vtx_dst))); + VK_CHECK(device_data->vtable.MapMemory(device_data->device, draw->index_buffer_mem, + 0, index_size, 0, (void**)(&idx_dst))); + for (int n = 0; n < draw_data->CmdListsCount; n++) + { + const ImDrawList* cmd_list = draw_data->CmdLists[n]; + memcpy(vtx_dst, cmd_list->VtxBuffer.Data, cmd_list->VtxBuffer.Size * sizeof(ImDrawVert)); + memcpy(idx_dst, cmd_list->IdxBuffer.Data, cmd_list->IdxBuffer.Size * sizeof(ImDrawIdx)); + vtx_dst += cmd_list->VtxBuffer.Size; + idx_dst += cmd_list->IdxBuffer.Size; + } + VkMappedMemoryRange range[2] = {}; + range[0].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + range[0].memory = draw->vertex_buffer_mem; + range[0].size = VK_WHOLE_SIZE; + range[1].sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + range[1].memory = draw->index_buffer_mem; + range[1].size = VK_WHOLE_SIZE; + VK_CHECK(device_data->vtable.FlushMappedMemoryRanges(device_data->device, 2, range)); + device_data->vtable.UnmapMemory(device_data->device, draw->vertex_buffer_mem); + device_data->vtable.UnmapMemory(device_data->device, draw->index_buffer_mem); + + /* Bind pipeline and 
descriptor sets */ + device_data->vtable.CmdBindPipeline(draw->command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, data->pipeline); + VkDescriptorSet desc_set[1] = { data->descriptor_set }; + device_data->vtable.CmdBindDescriptorSets(draw->command_buffer, VK_PIPELINE_BIND_POINT_GRAPHICS, + data->pipeline_layout, 0, 1, desc_set, 0, NULL); + + /* Bind vertex & index buffers */ + VkBuffer vertex_buffers[1] = { draw->vertex_buffer }; + VkDeviceSize vertex_offset[1] = { 0 }; + device_data->vtable.CmdBindVertexBuffers(draw->command_buffer, 0, 1, vertex_buffers, vertex_offset); + device_data->vtable.CmdBindIndexBuffer(draw->command_buffer, draw->index_buffer, 0, VK_INDEX_TYPE_UINT16); + + /* Setup viewport */ + VkViewport viewport; + viewport.x = 0; + viewport.y = 0; + viewport.width = draw_data->DisplaySize.x; + viewport.height = draw_data->DisplaySize.y; + viewport.minDepth = 0.0f; + viewport.maxDepth = 1.0f; + device_data->vtable.CmdSetViewport(draw->command_buffer, 0, 1, &viewport); + + + /* Setup scale and translation through push constants : + * + * Our visible imgui space lies from draw_data->DisplayPos (top left) to + * draw_data->DisplayPos+data_data->DisplaySize (bottom right). DisplayMin + * is typically (0,0) for single viewport apps. + */ + float scale[2]; + scale[0] = 2.0f / draw_data->DisplaySize.x; + scale[1] = 2.0f / draw_data->DisplaySize.y; + float translate[2]; + translate[0] = -1.0f - draw_data->DisplayPos.x * scale[0]; + translate[1] = -1.0f - draw_data->DisplayPos.y * scale[1]; + device_data->vtable.CmdPushConstants(draw->command_buffer, data->pipeline_layout, + VK_SHADER_STAGE_VERTEX_BIT, + sizeof(float) * 0, sizeof(float) * 2, scale); + device_data->vtable.CmdPushConstants(draw->command_buffer, data->pipeline_layout, + VK_SHADER_STAGE_VERTEX_BIT, + sizeof(float) * 2, sizeof(float) * 2, translate); + + // Render the command lists: + int vtx_offset = 0; + int idx_offset = 0; + ImVec2 display_pos = draw_data->DisplayPos; + for (int n = 0; n < draw_data->CmdListsCount; n++) + { + const ImDrawList* cmd_list = draw_data->CmdLists[n]; + for (int cmd_i = 0; cmd_i < cmd_list->CmdBuffer.Size; cmd_i++) + { + const ImDrawCmd* pcmd = &cmd_list->CmdBuffer[cmd_i]; + // Apply scissor/clipping rectangle + // FIXME: We could clamp width/height based on clamped min/max values. + VkRect2D scissor; + scissor.offset.x = (int32_t)(pcmd->ClipRect.x - display_pos.x) > 0 ? (int32_t)(pcmd->ClipRect.x - display_pos.x) : 0; + scissor.offset.y = (int32_t)(pcmd->ClipRect.y - display_pos.y) > 0 ? (int32_t)(pcmd->ClipRect.y - display_pos.y) : 0; + scissor.extent.width = (uint32_t)(pcmd->ClipRect.z - pcmd->ClipRect.x); + scissor.extent.height = (uint32_t)(pcmd->ClipRect.w - pcmd->ClipRect.y + 1); // FIXME: Why +1 here? + device_data->vtable.CmdSetScissor(draw->command_buffer, 0, 1, &scissor); + + // Draw + device_data->vtable.CmdDrawIndexed(draw->command_buffer, pcmd->ElemCount, 1, idx_offset, vtx_offset, 0); + + idx_offset += pcmd->ElemCount; + } + vtx_offset += cmd_list->VtxBuffer.Size; + } + + device_data->vtable.CmdEndRenderPass(draw->command_buffer); + + /* Bounce the image to display back to present layout. 
*/ + imb.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + imb.pNext = nullptr; + imb.srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + imb.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + imb.oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + imb.newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + imb.image = data->images[image_index]; + imb.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + imb.subresourceRange.baseMipLevel = 0; + imb.subresourceRange.levelCount = 1; + imb.subresourceRange.baseArrayLayer = 0; + imb.subresourceRange.layerCount = 1; + imb.srcQueueFamilyIndex = device_data->graphic_queue->family_index; + imb.dstQueueFamilyIndex = present_queue->family_index; + device_data->vtable.CmdPipelineBarrier(draw->command_buffer, + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, + 0, /* dependency flags */ + 0, nullptr, /* memory barriers */ + 0, nullptr, /* buffer memory barriers */ + 1, &imb); /* image memory barriers */ + + device_data->vtable.EndCommandBuffer(draw->command_buffer); + + VkSubmitInfo submit_info = {}; + VkPipelineStageFlags stage_wait = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submit_info.commandBufferCount = 1; + submit_info.pCommandBuffers = &draw->command_buffer; + submit_info.pWaitDstStageMask = &stage_wait; + submit_info.waitSemaphoreCount = n_wait_semaphores; + submit_info.pWaitSemaphores = wait_semaphores; + submit_info.signalSemaphoreCount = 1; + submit_info.pSignalSemaphores = &draw->semaphore; + + device_data->vtable.QueueSubmit(device_data->graphic_queue->queue, 1, &submit_info, draw->fence); + + return draw; +} + +static const uint32_t overlay_vert_spv[] = { +#include "overlay.vert.spv.h" +}; +static const uint32_t overlay_frag_spv[] = { +#include "overlay.frag.spv.h" +}; + +static void setup_swapchain_data_pipeline(struct swapchain_data *data) +{ + struct device_data *device_data = data->device; + VkShaderModule vert_module, frag_module; + + /* Create shader modules */ + VkShaderModuleCreateInfo vert_info = {}; + vert_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + vert_info.codeSize = sizeof(overlay_vert_spv); + vert_info.pCode = overlay_vert_spv; + VK_CHECK(device_data->vtable.CreateShaderModule(device_data->device, + &vert_info, NULL, &vert_module)); + VkShaderModuleCreateInfo frag_info = {}; + frag_info.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + frag_info.codeSize = sizeof(overlay_frag_spv); + frag_info.pCode = (uint32_t*)overlay_frag_spv; + VK_CHECK(device_data->vtable.CreateShaderModule(device_data->device, + &frag_info, NULL, &frag_module)); + + /* Font sampler */ + VkSamplerCreateInfo sampler_info = {}; + sampler_info.sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO; + sampler_info.magFilter = VK_FILTER_LINEAR; + sampler_info.minFilter = VK_FILTER_LINEAR; + sampler_info.mipmapMode = VK_SAMPLER_MIPMAP_MODE_LINEAR; + sampler_info.addressModeU = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_info.addressModeV = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_info.addressModeW = VK_SAMPLER_ADDRESS_MODE_REPEAT; + sampler_info.minLod = -1000; + sampler_info.maxLod = 1000; + sampler_info.maxAnisotropy = 1.0f; + VK_CHECK(device_data->vtable.CreateSampler(device_data->device, &sampler_info, + NULL, &data->font_sampler)); + + /* Descriptor pool */ + VkDescriptorPoolSize sampler_pool_size = {}; + sampler_pool_size.type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + sampler_pool_size.descriptorCount = 1; + VkDescriptorPoolCreateInfo desc_pool_info = 
{}; + desc_pool_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + desc_pool_info.maxSets = 1; + desc_pool_info.poolSizeCount = 1; + desc_pool_info.pPoolSizes = &sampler_pool_size; + VK_CHECK(device_data->vtable.CreateDescriptorPool(device_data->device, + &desc_pool_info, + NULL, &data->descriptor_pool)); + + /* Descriptor layout */ + VkSampler sampler[1] = { data->font_sampler }; + VkDescriptorSetLayoutBinding binding[1] = {}; + binding[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + binding[0].descriptorCount = 1; + binding[0].stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT; + binding[0].pImmutableSamplers = sampler; + VkDescriptorSetLayoutCreateInfo set_layout_info = {}; + set_layout_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + set_layout_info.bindingCount = 1; + set_layout_info.pBindings = binding; + VK_CHECK(device_data->vtable.CreateDescriptorSetLayout(device_data->device, + &set_layout_info, + NULL, &data->descriptor_layout)); + + /* Descriptor set */ + VkDescriptorSetAllocateInfo alloc_info = {}; + alloc_info.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + alloc_info.descriptorPool = data->descriptor_pool; + alloc_info.descriptorSetCount = 1; + alloc_info.pSetLayouts = &data->descriptor_layout; + VK_CHECK(device_data->vtable.AllocateDescriptorSets(device_data->device, + &alloc_info, + &data->descriptor_set)); + + /* Constants: we are using 'vec2 offset' and 'vec2 scale' instead of a full + * 3d projection matrix + */ + VkPushConstantRange push_constants[1] = {}; + push_constants[0].stageFlags = VK_SHADER_STAGE_VERTEX_BIT; + push_constants[0].offset = sizeof(float) * 0; + push_constants[0].size = sizeof(float) * 4; + VkPipelineLayoutCreateInfo layout_info = {}; + layout_info.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + layout_info.setLayoutCount = 1; + layout_info.pSetLayouts = &data->descriptor_layout; + layout_info.pushConstantRangeCount = 1; + layout_info.pPushConstantRanges = push_constants; + VK_CHECK(device_data->vtable.CreatePipelineLayout(device_data->device, + &layout_info, + NULL, &data->pipeline_layout)); + + VkPipelineShaderStageCreateInfo stage[2] = {}; + stage[0].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stage[0].stage = VK_SHADER_STAGE_VERTEX_BIT; + stage[0].module = vert_module; + stage[0].pName = "main"; + stage[1].sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + stage[1].stage = VK_SHADER_STAGE_FRAGMENT_BIT; + stage[1].module = frag_module; + stage[1].pName = "main"; + + VkVertexInputBindingDescription binding_desc[1] = {}; + binding_desc[0].stride = sizeof(ImDrawVert); + binding_desc[0].inputRate = VK_VERTEX_INPUT_RATE_VERTEX; + + VkVertexInputAttributeDescription attribute_desc[3] = {}; + attribute_desc[0].location = 0; + attribute_desc[0].binding = binding_desc[0].binding; + attribute_desc[0].format = VK_FORMAT_R32G32_SFLOAT; + attribute_desc[0].offset = IM_OFFSETOF(ImDrawVert, pos); + attribute_desc[1].location = 1; + attribute_desc[1].binding = binding_desc[0].binding; + attribute_desc[1].format = VK_FORMAT_R32G32_SFLOAT; + attribute_desc[1].offset = IM_OFFSETOF(ImDrawVert, uv); + attribute_desc[2].location = 2; + attribute_desc[2].binding = binding_desc[0].binding; + attribute_desc[2].format = VK_FORMAT_R8G8B8A8_UNORM; + attribute_desc[2].offset = IM_OFFSETOF(ImDrawVert, col); + + VkPipelineVertexInputStateCreateInfo vertex_info = {}; + vertex_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO; + vertex_info.vertexBindingDescriptionCount = 
1; + vertex_info.pVertexBindingDescriptions = binding_desc; + vertex_info.vertexAttributeDescriptionCount = 3; + vertex_info.pVertexAttributeDescriptions = attribute_desc; + + VkPipelineInputAssemblyStateCreateInfo ia_info = {}; + ia_info.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO; + ia_info.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST; + + VkPipelineViewportStateCreateInfo viewport_info = {}; + viewport_info.sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO; + viewport_info.viewportCount = 1; + viewport_info.scissorCount = 1; + + VkPipelineRasterizationStateCreateInfo raster_info = {}; + raster_info.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO; + raster_info.polygonMode = VK_POLYGON_MODE_FILL; + raster_info.cullMode = VK_CULL_MODE_NONE; + raster_info.frontFace = VK_FRONT_FACE_COUNTER_CLOCKWISE; + raster_info.lineWidth = 1.0f; + + VkPipelineMultisampleStateCreateInfo ms_info = {}; + ms_info.sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO; + ms_info.rasterizationSamples = VK_SAMPLE_COUNT_1_BIT; + + VkPipelineColorBlendAttachmentState color_attachment[1] = {}; + color_attachment[0].blendEnable = VK_TRUE; + color_attachment[0].srcColorBlendFactor = VK_BLEND_FACTOR_SRC_ALPHA; + color_attachment[0].dstColorBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA; + color_attachment[0].colorBlendOp = VK_BLEND_OP_ADD; + color_attachment[0].srcAlphaBlendFactor = VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA; + color_attachment[0].dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO; + color_attachment[0].alphaBlendOp = VK_BLEND_OP_ADD; + color_attachment[0].colorWriteMask = VK_COLOR_COMPONENT_R_BIT | + VK_COLOR_COMPONENT_G_BIT | VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT; + + VkPipelineDepthStencilStateCreateInfo depth_info = {}; + depth_info.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO; + + VkPipelineColorBlendStateCreateInfo blend_info = {}; + blend_info.sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO; + blend_info.attachmentCount = 1; + blend_info.pAttachments = color_attachment; + + VkDynamicState dynamic_states[2] = { VK_DYNAMIC_STATE_VIEWPORT, VK_DYNAMIC_STATE_SCISSOR }; + VkPipelineDynamicStateCreateInfo dynamic_state = {}; + dynamic_state.sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO; + dynamic_state.dynamicStateCount = (uint32_t)IM_ARRAYSIZE(dynamic_states); + dynamic_state.pDynamicStates = dynamic_states; + + VkGraphicsPipelineCreateInfo info = {}; + info.sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO; + info.flags = 0; + info.stageCount = 2; + info.pStages = stage; + info.pVertexInputState = &vertex_info; + info.pInputAssemblyState = &ia_info; + info.pViewportState = &viewport_info; + info.pRasterizationState = &raster_info; + info.pMultisampleState = &ms_info; + info.pDepthStencilState = &depth_info; + info.pColorBlendState = &blend_info; + info.pDynamicState = &dynamic_state; + info.layout = data->pipeline_layout; + info.renderPass = data->render_pass; + VK_CHECK( + device_data->vtable.CreateGraphicsPipelines(device_data->device, VK_NULL_HANDLE, + 1, &info, + NULL, &data->pipeline)); + + device_data->vtable.DestroyShaderModule(device_data->device, vert_module, NULL); + device_data->vtable.DestroyShaderModule(device_data->device, frag_module, NULL); + + ImGuiIO& io = ImGui::GetIO(); + unsigned char* pixels; + int width, height; + io.Fonts->GetTexDataAsRGBA32(&pixels, &width, &height); + + /* Font image */ + VkImageCreateInfo image_info = {}; + image_info.sType = 
VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO; + image_info.imageType = VK_IMAGE_TYPE_2D; + image_info.format = VK_FORMAT_R8G8B8A8_UNORM; + image_info.extent.width = width; + image_info.extent.height = height; + image_info.extent.depth = 1; + image_info.mipLevels = 1; + image_info.arrayLayers = 1; + image_info.samples = VK_SAMPLE_COUNT_1_BIT; + image_info.tiling = VK_IMAGE_TILING_OPTIMAL; + image_info.usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT; + image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + image_info.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + VK_CHECK(device_data->vtable.CreateImage(device_data->device, &image_info, + NULL, &data->font_image)); + VkMemoryRequirements font_image_req; + device_data->vtable.GetImageMemoryRequirements(device_data->device, + data->font_image, &font_image_req); + VkMemoryAllocateInfo image_alloc_info = {}; + image_alloc_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + image_alloc_info.allocationSize = font_image_req.size; + image_alloc_info.memoryTypeIndex = vk_memory_type(device_data, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + font_image_req.memoryTypeBits); + VK_CHECK(device_data->vtable.AllocateMemory(device_data->device, &image_alloc_info, + NULL, &data->font_mem)); + VK_CHECK(device_data->vtable.BindImageMemory(device_data->device, + data->font_image, + data->font_mem, 0)); + + /* Font image view */ + VkImageViewCreateInfo view_info = {}; + view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + view_info.image = data->font_image; + view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; + view_info.format = VK_FORMAT_R8G8B8A8_UNORM; + view_info.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + view_info.subresourceRange.levelCount = 1; + view_info.subresourceRange.layerCount = 1; + VK_CHECK(device_data->vtable.CreateImageView(device_data->device, &view_info, + NULL, &data->font_image_view)); + + /* Descriptor set */ + VkDescriptorImageInfo desc_image[1] = {}; + desc_image[0].sampler = data->font_sampler; + desc_image[0].imageView = data->font_image_view; + desc_image[0].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + VkWriteDescriptorSet write_desc[1] = {}; + write_desc[0].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + write_desc[0].dstSet = data->descriptor_set; + write_desc[0].descriptorCount = 1; + write_desc[0].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + write_desc[0].pImageInfo = desc_image; + device_data->vtable.UpdateDescriptorSets(device_data->device, 1, write_desc, 0, NULL); +} + +static void setup_swapchain_data(struct swapchain_data *data, + const VkSwapchainCreateInfoKHR *pCreateInfo) +{ + data->width = pCreateInfo->imageExtent.width; + data->height = pCreateInfo->imageExtent.height; + data->format = pCreateInfo->imageFormat; + + data->imgui_context = ImGui::CreateContext(); + ImGui::SetCurrentContext(data->imgui_context); + + ImGui::GetIO().IniFilename = NULL; + ImGui::GetIO().DisplaySize = ImVec2((float)data->width, (float)data->height); + + struct device_data *device_data = data->device; + + /* Render pass */ + VkAttachmentDescription attachment_desc = {}; + attachment_desc.format = pCreateInfo->imageFormat; + attachment_desc.samples = VK_SAMPLE_COUNT_1_BIT; + attachment_desc.loadOp = VK_ATTACHMENT_LOAD_OP_LOAD; + attachment_desc.storeOp = VK_ATTACHMENT_STORE_OP_STORE; + attachment_desc.stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE; + attachment_desc.stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE; + attachment_desc.initialLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; 
+ attachment_desc.finalLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR; + VkAttachmentReference color_attachment = {}; + color_attachment.attachment = 0; + color_attachment.layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; + VkSubpassDescription subpass = {}; + subpass.pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS; + subpass.colorAttachmentCount = 1; + subpass.pColorAttachments = &color_attachment; + VkSubpassDependency dependency = {}; + dependency.srcSubpass = VK_SUBPASS_EXTERNAL; + dependency.dstSubpass = 0; + dependency.srcStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.dstStageMask = VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; + dependency.srcAccessMask = 0; + dependency.dstAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; + VkRenderPassCreateInfo render_pass_info = {}; + render_pass_info.sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO; + render_pass_info.attachmentCount = 1; + render_pass_info.pAttachments = &attachment_desc; + render_pass_info.subpassCount = 1; + render_pass_info.pSubpasses = &subpass; + render_pass_info.dependencyCount = 1; + render_pass_info.pDependencies = &dependency; + VK_CHECK(device_data->vtable.CreateRenderPass(device_data->device, + &render_pass_info, + NULL, &data->render_pass)); + + setup_swapchain_data_pipeline(data); + + VK_CHECK(device_data->vtable.GetSwapchainImagesKHR(device_data->device, + data->swapchain, + &data->n_images, + NULL)); + + data->images = ralloc_array(data, VkImage, data->n_images); + data->image_views = ralloc_array(data, VkImageView, data->n_images); + data->framebuffers = ralloc_array(data, VkFramebuffer, data->n_images); + + VK_CHECK(device_data->vtable.GetSwapchainImagesKHR(device_data->device, + data->swapchain, + &data->n_images, + data->images)); + + /* Image views */ + VkImageViewCreateInfo view_info = {}; + view_info.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + view_info.viewType = VK_IMAGE_VIEW_TYPE_2D; + view_info.format = pCreateInfo->imageFormat; + view_info.components.r = VK_COMPONENT_SWIZZLE_R; + view_info.components.g = VK_COMPONENT_SWIZZLE_G; + view_info.components.b = VK_COMPONENT_SWIZZLE_B; + view_info.components.a = VK_COMPONENT_SWIZZLE_A; + view_info.subresourceRange = { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }; + for (uint32_t i = 0; i < data->n_images; i++) { + view_info.image = data->images[i]; + VK_CHECK(device_data->vtable.CreateImageView(device_data->device, + &view_info, NULL, + &data->image_views[i])); + } + + /* Framebuffers */ + VkImageView attachment[1]; + VkFramebufferCreateInfo fb_info = {}; + fb_info.sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO; + fb_info.renderPass = data->render_pass; + fb_info.attachmentCount = 1; + fb_info.pAttachments = attachment; + fb_info.width = data->width; + fb_info.height = data->height; + fb_info.layers = 1; + for (uint32_t i = 0; i < data->n_images; i++) { + attachment[0] = data->image_views[i]; + VK_CHECK(device_data->vtable.CreateFramebuffer(device_data->device, &fb_info, + NULL, &data->framebuffers[i])); + } + + /* Command buffer pool */ + VkCommandPoolCreateInfo cmd_buffer_pool_info = {}; + cmd_buffer_pool_info.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + cmd_buffer_pool_info.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + cmd_buffer_pool_info.queueFamilyIndex = device_data->graphic_queue->family_index; + VK_CHECK(device_data->vtable.CreateCommandPool(device_data->device, + &cmd_buffer_pool_info, + NULL, &data->command_pool)); +} + +static void shutdown_swapchain_data(struct swapchain_data *data) +{ + 
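/* [Editor's note] Everything created in setup_swapchain_data() and
+    * get_overlay_draw() is released here, roughly in reverse creation order:
+    * per-draw semaphores, fences and buffers, then per-image views and
+    * framebuffers, then the render pass, command pool, pipeline, descriptor
+    * objects, font objects and finally the ImGui context. */
+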
struct device_data *device_data = data->device; + + list_for_each_entry_safe(struct overlay_draw, draw, &data->draws, link) { + device_data->vtable.DestroySemaphore(device_data->device, draw->semaphore, NULL); + device_data->vtable.DestroyFence(device_data->device, draw->fence, NULL); + device_data->vtable.DestroyBuffer(device_data->device, draw->vertex_buffer, NULL); + device_data->vtable.DestroyBuffer(device_data->device, draw->index_buffer, NULL); + device_data->vtable.FreeMemory(device_data->device, draw->vertex_buffer_mem, NULL); + device_data->vtable.FreeMemory(device_data->device, draw->index_buffer_mem, NULL); + } + + for (uint32_t i = 0; i < data->n_images; i++) { + device_data->vtable.DestroyImageView(device_data->device, data->image_views[i], NULL); + device_data->vtable.DestroyFramebuffer(device_data->device, data->framebuffers[i], NULL); + } + + device_data->vtable.DestroyRenderPass(device_data->device, data->render_pass, NULL); + + device_data->vtable.DestroyCommandPool(device_data->device, data->command_pool, NULL); + + device_data->vtable.DestroyPipeline(device_data->device, data->pipeline, NULL); + device_data->vtable.DestroyPipelineLayout(device_data->device, data->pipeline_layout, NULL); + + device_data->vtable.DestroyDescriptorPool(device_data->device, + data->descriptor_pool, NULL); + device_data->vtable.DestroyDescriptorSetLayout(device_data->device, + data->descriptor_layout, NULL); + + device_data->vtable.DestroySampler(device_data->device, data->font_sampler, NULL); + device_data->vtable.DestroyImageView(device_data->device, data->font_image_view, NULL); + device_data->vtable.DestroyImage(device_data->device, data->font_image, NULL); + device_data->vtable.FreeMemory(device_data->device, data->font_mem, NULL); + + device_data->vtable.DestroyBuffer(device_data->device, data->upload_font_buffer, NULL); + device_data->vtable.FreeMemory(device_data->device, data->upload_font_buffer_mem, NULL); + + ImGui::DestroyContext(data->imgui_context); +} + +static struct overlay_draw *before_present(struct swapchain_data *swapchain_data, + struct queue_data *present_queue, + const VkSemaphore *wait_semaphores, + unsigned n_wait_semaphores, + unsigned imageIndex) +{ + struct instance_data *instance_data = swapchain_data->device->instance; + struct overlay_draw *draw = NULL; + + snapshot_swapchain_frame(swapchain_data); + + if (!instance_data->params.no_display && swapchain_data->n_frames > 0) { + compute_swapchain_display(swapchain_data); + draw = render_swapchain_display(swapchain_data, present_queue, + wait_semaphores, n_wait_semaphores, + imageIndex); + } + + return draw; +} + +static VkResult overlay_CreateSwapchainKHR( + VkDevice device, + const VkSwapchainCreateInfoKHR* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkSwapchainKHR* pSwapchain) +{ + struct device_data *device_data = FIND(struct device_data, device); + VkResult result = device_data->vtable.CreateSwapchainKHR(device, pCreateInfo, pAllocator, pSwapchain); + if (result != VK_SUCCESS) return result; + + struct swapchain_data *swapchain_data = new_swapchain_data(*pSwapchain, device_data); + setup_swapchain_data(swapchain_data, pCreateInfo); + return result; +} + +static void overlay_DestroySwapchainKHR( + VkDevice device, + VkSwapchainKHR swapchain, + const VkAllocationCallbacks* pAllocator) +{ + struct swapchain_data *swapchain_data = + FIND(struct swapchain_data, swapchain); + + shutdown_swapchain_data(swapchain_data); + swapchain_data->device->vtable.DestroySwapchainKHR(device, swapchain, pAllocator); 
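+   /* [Editor's note] Teardown order matters here: shutdown_swapchain_data()
+    * above destroyed the overlay's own objects while the swapchain was still
+    * valid, and only after the driver's DestroySwapchainKHR() has returned is
+    * the tracking entry unmapped and freed below. */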
+   destroy_swapchain_data(swapchain_data);
+}
+
+static VkResult overlay_QueuePresentKHR(
+    VkQueue                                     queue,
+    const VkPresentInfoKHR*                     pPresentInfo)
+{
+   struct queue_data *queue_data = FIND(struct queue_data, queue);
+   struct device_data *device_data = queue_data->device;
+   struct instance_data *instance_data = device_data->instance;
+   uint32_t query_results[OVERLAY_QUERY_COUNT];
+
+   device_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_frame]++;
+
+   if (list_length(&queue_data->running_command_buffer) > 0) {
+      /* Before getting the query results, make sure the operations have
+       * completed.
+       */
+      VK_CHECK(device_data->vtable.ResetFences(device_data->device,
+                                               1, &queue_data->queries_fence));
+      VK_CHECK(device_data->vtable.QueueSubmit(queue, 0, NULL, queue_data->queries_fence));
+      VK_CHECK(device_data->vtable.WaitForFences(device_data->device,
+                                                 1, &queue_data->queries_fence,
+                                                 VK_FALSE, UINT64_MAX));
+
+      /* Now get the results. */
+      list_for_each_entry_safe(struct command_buffer_data, cmd_buffer_data,
+                               &queue_data->running_command_buffer, link) {
+         list_delinit(&cmd_buffer_data->link);
+
+         if (cmd_buffer_data->pipeline_query_pool) {
+            memset(query_results, 0, sizeof(query_results));
+            VK_CHECK(device_data->vtable.GetQueryPoolResults(device_data->device,
+                                                             cmd_buffer_data->pipeline_query_pool,
+                                                             cmd_buffer_data->query_index, 1,
+                                                             sizeof(uint32_t) * OVERLAY_QUERY_COUNT,
+                                                             query_results, 0, VK_QUERY_RESULT_WAIT_BIT));
+
+            for (uint32_t i = OVERLAY_PARAM_ENABLED_vertices;
+                 i <= OVERLAY_PARAM_ENABLED_compute_invocations; i++) {
+               device_data->frame_stats.stats[i] += query_results[i - OVERLAY_PARAM_ENABLED_vertices];
+            }
+         }
+         if (cmd_buffer_data->timestamp_query_pool) {
+            uint64_t gpu_timestamps[2] = { 0 };
+            VK_CHECK(device_data->vtable.GetQueryPoolResults(device_data->device,
+                                                             cmd_buffer_data->timestamp_query_pool,
+                                                             cmd_buffer_data->query_index * 2, 2,
+                                                             2 * sizeof(uint64_t), gpu_timestamps, sizeof(uint64_t),
+                                                             VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT));
+
+            gpu_timestamps[0] &= queue_data->timestamp_mask;
+            gpu_timestamps[1] &= queue_data->timestamp_mask;
+            device_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_gpu_timing] +=
+               (gpu_timestamps[1] - gpu_timestamps[0]) *
+               device_data->properties.limits.timestampPeriod;
+         }
+      }
+   }
+
+   /* When the hud is displayed, we need to add our overlay drawing
+    * semaphore to the list of semaphores to wait on. If we don't do that,
+    * the presented picture might have incomplete overlay drawings.
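+    * (The overlay draw is submitted by before_present() and signals its
+    * semaphore once the hud has been rendered on top of the frame.)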
+    */
+   VkResult result = VK_SUCCESS;
+   if (instance_data->params.no_display) {
+      for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
+         VkSwapchainKHR swapchain = pPresentInfo->pSwapchains[i];
+         struct swapchain_data *swapchain_data =
+            FIND(struct swapchain_data, swapchain);
+
+         before_present(swapchain_data,
+                        queue_data,
+                        pPresentInfo->pWaitSemaphores,
+                        pPresentInfo->waitSemaphoreCount,
+                        pPresentInfo->pImageIndices[i]);
+
+         VkPresentInfoKHR present_info = *pPresentInfo;
+         present_info.swapchainCount = 1;
+         present_info.pSwapchains = &swapchain;
+
+         uint64_t ts0 = os_time_get();
+         result = queue_data->device->vtable.QueuePresentKHR(queue, &present_info);
+         uint64_t ts1 = os_time_get();
+         swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_present_timing] += ts1 - ts0;
+      }
+   } else {
+      for (uint32_t i = 0; i < pPresentInfo->swapchainCount; i++) {
+         VkSwapchainKHR swapchain = pPresentInfo->pSwapchains[i];
+         struct swapchain_data *swapchain_data =
+            FIND(struct swapchain_data, swapchain);
+         VkPresentInfoKHR present_info = *pPresentInfo;
+         present_info.swapchainCount = 1;
+         present_info.pSwapchains = &swapchain;
+
+         uint32_t image_index = pPresentInfo->pImageIndices[i];
+
+         struct overlay_draw *draw = before_present(swapchain_data,
+                                                    queue_data,
+                                                    pPresentInfo->pWaitSemaphores,
+                                                    pPresentInfo->waitSemaphoreCount,
+                                                    image_index);
+
+         /* Because the submission of the overlay draw waits on the
+          * semaphores handed in for present, this present operation doesn't
+          * need to wait on them as well; it can just wait on the overlay
+          * submission semaphore.
+          */
+         present_info.pWaitSemaphores = &draw->semaphore;
+         present_info.waitSemaphoreCount = 1;
+
+         uint64_t ts0 = os_time_get();
+         VkResult chain_result = queue_data->device->vtable.QueuePresentKHR(queue, &present_info);
+         uint64_t ts1 = os_time_get();
+         swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_present_timing] += ts1 - ts0;
+         if (pPresentInfo->pResults)
+            pPresentInfo->pResults[i] = chain_result;
+         if (chain_result != VK_SUCCESS && result == VK_SUCCESS)
+            result = chain_result;
+      }
+   }
+   return result;
+}
+
+static VkResult overlay_AcquireNextImageKHR(
+    VkDevice                                    device,
+    VkSwapchainKHR                              swapchain,
+    uint64_t                                    timeout,
+    VkSemaphore                                 semaphore,
+    VkFence                                     fence,
+    uint32_t*                                   pImageIndex)
+{
+   struct swapchain_data *swapchain_data =
+      FIND(struct swapchain_data, swapchain);
+   struct device_data *device_data = swapchain_data->device;
+
+   uint64_t ts0 = os_time_get();
+   VkResult result = device_data->vtable.AcquireNextImageKHR(device, swapchain, timeout,
+                                                             semaphore, fence, pImageIndex);
+   uint64_t ts1 = os_time_get();
+
+   swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_acquire_timing] += ts1 - ts0;
+   swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_acquire]++;
+
+   return result;
+}
+
+static VkResult overlay_AcquireNextImage2KHR(
+    VkDevice                                    device,
+    const VkAcquireNextImageInfoKHR*            pAcquireInfo,
+    uint32_t*                                   pImageIndex)
+{
+   struct swapchain_data *swapchain_data =
+      FIND(struct swapchain_data, pAcquireInfo->swapchain);
+   struct device_data *device_data = swapchain_data->device;
+
+   uint64_t ts0 = os_time_get();
+   VkResult result = device_data->vtable.AcquireNextImage2KHR(device, pAcquireInfo, pImageIndex);
+   uint64_t ts1 = os_time_get();
+
+   swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_acquire_timing] += ts1 - ts0;
+   swapchain_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_acquire]++;
+
+   return result;
+}
+
+static void overlay_CmdDraw(
+    VkCommandBuffer                             commandBuffer,
+    uint32_t                                    vertexCount,
+ uint32_t instanceCount, + uint32_t firstVertex, + uint32_t firstInstance) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw]++; + struct device_data *device_data = cmd_buffer_data->device; + device_data->vtable.CmdDraw(commandBuffer, vertexCount, instanceCount, + firstVertex, firstInstance); +} + +static void overlay_CmdDrawIndexed( + VkCommandBuffer commandBuffer, + uint32_t indexCount, + uint32_t instanceCount, + uint32_t firstIndex, + int32_t vertexOffset, + uint32_t firstInstance) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indexed]++; + struct device_data *device_data = cmd_buffer_data->device; + device_data->vtable.CmdDrawIndexed(commandBuffer, indexCount, instanceCount, + firstIndex, vertexOffset, firstInstance); +} + +static void overlay_CmdDrawIndirect( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indirect]++; + struct device_data *device_data = cmd_buffer_data->device; + device_data->vtable.CmdDrawIndirect(commandBuffer, buffer, offset, drawCount, stride); +} + +static void overlay_CmdDrawIndexedIndirect( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + uint32_t drawCount, + uint32_t stride) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indexed_indirect]++; + struct device_data *device_data = cmd_buffer_data->device; + device_data->vtable.CmdDrawIndexedIndirect(commandBuffer, buffer, offset, drawCount, stride); +} + +static void overlay_CmdDrawIndirectCount( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indirect_count]++; + struct device_data *device_data = cmd_buffer_data->device; + device_data->vtable.CmdDrawIndirectCount(commandBuffer, buffer, offset, + countBuffer, countBufferOffset, + maxDrawCount, stride); +} + +static void overlay_CmdDrawIndexedIndirectCount( + VkCommandBuffer commandBuffer, + VkBuffer buffer, + VkDeviceSize offset, + VkBuffer countBuffer, + VkDeviceSize countBufferOffset, + uint32_t maxDrawCount, + uint32_t stride) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_draw_indexed_indirect_count]++; + struct device_data *device_data = cmd_buffer_data->device; + device_data->vtable.CmdDrawIndexedIndirectCount(commandBuffer, buffer, offset, + countBuffer, countBufferOffset, + maxDrawCount, stride); +} + +static void overlay_CmdDispatch( + VkCommandBuffer commandBuffer, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_dispatch]++; + struct device_data *device_data = cmd_buffer_data->device; + 
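+   /* Same pattern as the draw hooks above: bump the stat counter, then
+    * forward the call down the layer chain unchanged.
+    */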
device_data->vtable.CmdDispatch(commandBuffer, groupCountX, groupCountY, groupCountZ);
+}
+
+static void overlay_CmdDispatchIndirect(
+    VkCommandBuffer                             commandBuffer,
+    VkBuffer                                    buffer,
+    VkDeviceSize                                offset)
+{
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
+   cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_dispatch_indirect]++;
+   struct device_data *device_data = cmd_buffer_data->device;
+   device_data->vtable.CmdDispatchIndirect(commandBuffer, buffer, offset);
+}
+
+static void overlay_CmdBindPipeline(
+    VkCommandBuffer                             commandBuffer,
+    VkPipelineBindPoint                         pipelineBindPoint,
+    VkPipeline                                  pipeline)
+{
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
+   switch (pipelineBindPoint) {
+   case VK_PIPELINE_BIND_POINT_GRAPHICS: cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_pipeline_graphics]++; break;
+   case VK_PIPELINE_BIND_POINT_COMPUTE: cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_pipeline_compute]++; break;
+   case VK_PIPELINE_BIND_POINT_RAY_TRACING_NV: cmd_buffer_data->stats.stats[OVERLAY_PARAM_ENABLED_pipeline_raytracing]++; break;
+   default: break;
+   }
+   struct device_data *device_data = cmd_buffer_data->device;
+   device_data->vtable.CmdBindPipeline(commandBuffer, pipelineBindPoint, pipeline);
+}
+
+static VkResult overlay_BeginCommandBuffer(
+    VkCommandBuffer                             commandBuffer,
+    const VkCommandBufferBeginInfo*             pBeginInfo)
+{
+   struct command_buffer_data *cmd_buffer_data =
+      FIND(struct command_buffer_data, commandBuffer);
+   struct device_data *device_data = cmd_buffer_data->device;
+
+   memset(&cmd_buffer_data->stats, 0, sizeof(cmd_buffer_data->stats));
+
+   /* We don't record any query in secondary command buffers; just make sure
+    * we have the right inheritance.
+    */
+   if (cmd_buffer_data->level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
+      VkCommandBufferBeginInfo *begin_info = (VkCommandBufferBeginInfo *)
+         clone_chain((const struct VkBaseInStructure *)pBeginInfo);
+      VkCommandBufferInheritanceInfo *parent_inhe_info = (VkCommandBufferInheritanceInfo *)
+         vk_find_struct(begin_info, COMMAND_BUFFER_INHERITANCE_INFO);
+      VkCommandBufferInheritanceInfo inhe_info = {
+         VK_STRUCTURE_TYPE_COMMAND_BUFFER_INHERITANCE_INFO,
+         NULL,
+         VK_NULL_HANDLE,
+         0,
+         VK_NULL_HANDLE,
+         VK_FALSE,
+         0,
+         overlay_query_flags,
+      };
+
+      if (parent_inhe_info)
+         parent_inhe_info->pipelineStatistics = overlay_query_flags;
+      else {
+         inhe_info.pNext = begin_info->pNext;
+         begin_info->pNext = &inhe_info;
+      }
+
+      /* Pass the patched chain down so the driver sees the inheritance
+       * info we just added, not the application's original pBeginInfo.
+       */
+      VkResult result = device_data->vtable.BeginCommandBuffer(commandBuffer, begin_info);
+
+      if (!parent_inhe_info)
+         begin_info->pNext = inhe_info.pNext;
+
+      free_chain((struct VkBaseOutStructure *)begin_info);
+
+      return result;
+   }
+
+   /* Otherwise record a begin query as first command.
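+    * The matching vkCmdEndQuery / closing timestamp is recorded in
+    * overlay_EndCommandBuffer().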
*/ + VkResult result = device_data->vtable.BeginCommandBuffer(commandBuffer, pBeginInfo); + + if (result == VK_SUCCESS) { + if (cmd_buffer_data->pipeline_query_pool) { + device_data->vtable.CmdResetQueryPool(commandBuffer, + cmd_buffer_data->pipeline_query_pool, + cmd_buffer_data->query_index, 1); + } + if (cmd_buffer_data->timestamp_query_pool) { + device_data->vtable.CmdResetQueryPool(commandBuffer, + cmd_buffer_data->timestamp_query_pool, + cmd_buffer_data->query_index * 2, 2); + } + if (cmd_buffer_data->pipeline_query_pool) { + device_data->vtable.CmdBeginQuery(commandBuffer, + cmd_buffer_data->pipeline_query_pool, + cmd_buffer_data->query_index, 0); + } + if (cmd_buffer_data->timestamp_query_pool) { + device_data->vtable.CmdWriteTimestamp(commandBuffer, + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + cmd_buffer_data->timestamp_query_pool, + cmd_buffer_data->query_index * 2); + } + } + + return result; +} + +static VkResult overlay_EndCommandBuffer( + VkCommandBuffer commandBuffer) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + struct device_data *device_data = cmd_buffer_data->device; + + if (cmd_buffer_data->timestamp_query_pool) { + device_data->vtable.CmdWriteTimestamp(commandBuffer, + VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + cmd_buffer_data->timestamp_query_pool, + cmd_buffer_data->query_index * 2 + 1); + } + if (cmd_buffer_data->pipeline_query_pool) { + device_data->vtable.CmdEndQuery(commandBuffer, + cmd_buffer_data->pipeline_query_pool, + cmd_buffer_data->query_index); + } + + return device_data->vtable.EndCommandBuffer(commandBuffer); +} + +static VkResult overlay_ResetCommandBuffer( + VkCommandBuffer commandBuffer, + VkCommandBufferResetFlags flags) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + struct device_data *device_data = cmd_buffer_data->device; + + memset(&cmd_buffer_data->stats, 0, sizeof(cmd_buffer_data->stats)); + + return device_data->vtable.ResetCommandBuffer(commandBuffer, flags); +} + +static void overlay_CmdExecuteCommands( + VkCommandBuffer commandBuffer, + uint32_t commandBufferCount, + const VkCommandBuffer* pCommandBuffers) +{ + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, commandBuffer); + struct device_data *device_data = cmd_buffer_data->device; + + /* Add the stats of the executed command buffers to the primary one. 
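+    * Secondary command buffers don't record queries of their own (see
+    * overlay_BeginCommandBuffer), so rolling their counters into the
+    * primary here is how their stats reach the queue.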
+    */
+   for (uint32_t c = 0; c < commandBufferCount; c++) {
+      struct command_buffer_data *sec_cmd_buffer_data =
+         FIND(struct command_buffer_data, pCommandBuffers[c]);
+
+      for (uint32_t s = 0; s < OVERLAY_PARAM_ENABLED_MAX; s++)
+         cmd_buffer_data->stats.stats[s] += sec_cmd_buffer_data->stats.stats[s];
+   }
+
+   device_data->vtable.CmdExecuteCommands(commandBuffer, commandBufferCount, pCommandBuffers);
+}
+
+static VkResult overlay_AllocateCommandBuffers(
+    VkDevice                                    device,
+    const VkCommandBufferAllocateInfo*          pAllocateInfo,
+    VkCommandBuffer*                            pCommandBuffers)
+{
+   struct device_data *device_data = FIND(struct device_data, device);
+   VkResult result =
+      device_data->vtable.AllocateCommandBuffers(device, pAllocateInfo, pCommandBuffers);
+   if (result != VK_SUCCESS)
+      return result;
+
+   VkQueryPool pipeline_query_pool = VK_NULL_HANDLE;
+   VkQueryPool timestamp_query_pool = VK_NULL_HANDLE;
+   if (device_data->instance->pipeline_statistics_enabled &&
+       pAllocateInfo->level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
+      VkQueryPoolCreateInfo pool_info = {
+         VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+         NULL,
+         0,
+         VK_QUERY_TYPE_PIPELINE_STATISTICS,
+         pAllocateInfo->commandBufferCount,
+         overlay_query_flags,
+      };
+      VK_CHECK(device_data->vtable.CreateQueryPool(device_data->device, &pool_info,
+                                                   NULL, &pipeline_query_pool));
+   }
+   if (device_data->instance->params.enabled[OVERLAY_PARAM_ENABLED_gpu_timing]) {
+      VkQueryPoolCreateInfo pool_info = {
+         VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO,
+         NULL,
+         0,
+         VK_QUERY_TYPE_TIMESTAMP,
+         pAllocateInfo->commandBufferCount * 2,
+         0,
+      };
+      VK_CHECK(device_data->vtable.CreateQueryPool(device_data->device, &pool_info,
+                                                   NULL, &timestamp_query_pool));
+   }
+
+   for (uint32_t i = 0; i < pAllocateInfo->commandBufferCount; i++) {
+      new_command_buffer_data(pCommandBuffers[i], pAllocateInfo->level,
+                              pipeline_query_pool, timestamp_query_pool,
+                              i, device_data);
+   }
+
+   if (pipeline_query_pool)
+      map_object(HKEY(pipeline_query_pool), (void *)(uintptr_t) pAllocateInfo->commandBufferCount);
+   if (timestamp_query_pool)
+      map_object(HKEY(timestamp_query_pool), (void *)(uintptr_t) pAllocateInfo->commandBufferCount);
+
+   return result;
+}
+
+static void overlay_FreeCommandBuffers(
+    VkDevice                                    device,
+    VkCommandPool                               commandPool,
+    uint32_t                                    commandBufferCount,
+    const VkCommandBuffer*                      pCommandBuffers)
+{
+   struct device_data *device_data = FIND(struct device_data, device);
+   for (uint32_t i = 0; i < commandBufferCount; i++) {
+      struct command_buffer_data *cmd_buffer_data =
+         FIND(struct command_buffer_data, pCommandBuffers[i]);
+
+      /* It is legal to free a NULL command buffer. */
+      if (!cmd_buffer_data)
+         continue;
+
+      /* The query pools are shared by all command buffers of an allocation;
+       * refcount them and destroy each pool when its last user is freed.
+       */
+      uint64_t count = (uintptr_t)find_object_data(HKEY(cmd_buffer_data->pipeline_query_pool));
+      if (count == 1) {
+         unmap_object(HKEY(cmd_buffer_data->pipeline_query_pool));
+         device_data->vtable.DestroyQueryPool(device_data->device,
+                                              cmd_buffer_data->pipeline_query_pool, NULL);
+      } else if (count != 0) {
+         map_object(HKEY(cmd_buffer_data->pipeline_query_pool), (void *)(uintptr_t)(count - 1));
+      }
+      count = (uintptr_t)find_object_data(HKEY(cmd_buffer_data->timestamp_query_pool));
+      if (count == 1) {
+         unmap_object(HKEY(cmd_buffer_data->timestamp_query_pool));
+         device_data->vtable.DestroyQueryPool(device_data->device,
+                                              cmd_buffer_data->timestamp_query_pool, NULL);
+      } else if (count != 0) {
+         map_object(HKEY(cmd_buffer_data->timestamp_query_pool), (void *)(uintptr_t)(count - 1));
+      }
+      destroy_command_buffer_data(cmd_buffer_data);
+   }
+
+   device_data->vtable.FreeCommandBuffers(device,
commandPool, + commandBufferCount, pCommandBuffers); +} + +static VkResult overlay_QueueSubmit( + VkQueue queue, + uint32_t submitCount, + const VkSubmitInfo* pSubmits, + VkFence fence) +{ + struct queue_data *queue_data = FIND(struct queue_data, queue); + struct device_data *device_data = queue_data->device; + + device_data->frame_stats.stats[OVERLAY_PARAM_ENABLED_submit]++; + + for (uint32_t s = 0; s < submitCount; s++) { + for (uint32_t c = 0; c < pSubmits[s].commandBufferCount; c++) { + struct command_buffer_data *cmd_buffer_data = + FIND(struct command_buffer_data, pSubmits[s].pCommandBuffers[c]); + + /* Merge the submitted command buffer stats into the device. */ + for (uint32_t st = 0; st < OVERLAY_PARAM_ENABLED_MAX; st++) + device_data->frame_stats.stats[st] += cmd_buffer_data->stats.stats[st]; + + /* Attach the command buffer to the queue so we remember to read its + * pipeline statistics & timestamps at QueuePresent(). + */ + if (!cmd_buffer_data->pipeline_query_pool && + !cmd_buffer_data->timestamp_query_pool) + continue; + + if (list_is_empty(&cmd_buffer_data->link)) { + list_addtail(&cmd_buffer_data->link, + &queue_data->running_command_buffer); + } else { + fprintf(stderr, "Command buffer submitted multiple times before present.\n" + "This could lead to invalid data.\n"); + } + } + } + + return device_data->vtable.QueueSubmit(queue, submitCount, pSubmits, fence); +} + +static VkResult overlay_CreateDevice( + VkPhysicalDevice physicalDevice, + const VkDeviceCreateInfo* pCreateInfo, + const VkAllocationCallbacks* pAllocator, + VkDevice* pDevice) +{ + struct instance_data *instance_data = + FIND(struct instance_data, physicalDevice); + VkLayerDeviceCreateInfo *chain_info = + get_device_chain_info(pCreateInfo, VK_LAYER_LINK_INFO); + + assert(chain_info->u.pLayerInfo); + PFN_vkGetInstanceProcAddr fpGetInstanceProcAddr = chain_info->u.pLayerInfo->pfnNextGetInstanceProcAddr; + PFN_vkGetDeviceProcAddr fpGetDeviceProcAddr = chain_info->u.pLayerInfo->pfnNextGetDeviceProcAddr; + PFN_vkCreateDevice fpCreateDevice = (PFN_vkCreateDevice)fpGetInstanceProcAddr(NULL, "vkCreateDevice"); + if (fpCreateDevice == NULL) { + return VK_ERROR_INITIALIZATION_FAILED; + } + + // Advance the link info for the next element on the chain + chain_info->u.pLayerInfo = chain_info->u.pLayerInfo->pNext; + + VkPhysicalDeviceFeatures device_features = {}; + VkDeviceCreateInfo device_info = *pCreateInfo; + + if (pCreateInfo->pEnabledFeatures) + device_features = *(pCreateInfo->pEnabledFeatures); + if (instance_data->pipeline_statistics_enabled) { + device_features.inheritedQueries = true; + device_features.pipelineStatisticsQuery = true; + } + device_info.pEnabledFeatures = &device_features; + + + VkResult result = fpCreateDevice(physicalDevice, &device_info, pAllocator, pDevice); + if (result != VK_SUCCESS) return result; + + struct device_data *device_data = new_device_data(*pDevice, instance_data); + device_data->physical_device = physicalDevice; + vk_load_device_commands(*pDevice, fpGetDeviceProcAddr, &device_data->vtable); + + instance_data->vtable.GetPhysicalDeviceProperties(device_data->physical_device, + &device_data->properties); + + VkLayerDeviceCreateInfo *load_data_info = + get_device_chain_info(pCreateInfo, VK_LOADER_DATA_CALLBACK); + device_data->set_device_loader_data = load_data_info->u.pfnSetDeviceLoaderData; + + device_map_queues(device_data, pCreateInfo); + + return result; +} + +static void overlay_DestroyDevice( + VkDevice device, + const VkAllocationCallbacks* pAllocator) +{ + struct 
device_data *device_data = FIND(struct device_data, device);
+   device_unmap_queues(device_data);
+   device_data->vtable.DestroyDevice(device, pAllocator);
+   destroy_device_data(device_data);
+}
+
+static VkResult overlay_CreateInstance(
+    const VkInstanceCreateInfo*                 pCreateInfo,
+    const VkAllocationCallbacks*                pAllocator,
+    VkInstance*                                 pInstance)
+{
+   VkLayerInstanceCreateInfo *chain_info =
+      get_instance_chain_info(pCreateInfo, VK_LAYER_LINK_INFO);
+
+   /* pApplicationInfo and pEngineName are both optional; guard against
+    * NULL before assigning to the string.
+    */
+   if (pCreateInfo->pApplicationInfo && pCreateInfo->pApplicationInfo->pEngineName)
+      engineName = pCreateInfo->pApplicationInfo->pEngineName;
+   if (engineName == "DXVK" || engineName == "vkd3d") {
+      uint32_t engineVer = pCreateInfo->pApplicationInfo->engineVersion;
+      engineVersion = to_string(VK_VERSION_MAJOR(engineVer)) + "." +
+                      to_string(VK_VERSION_MINOR(engineVer)) + "." +
+                      to_string(VK_VERSION_PATCH(engineVer));
+   }
+
+   if (engineName != "DXVK" && engineName != "vkd3d" && engineName != "Feral3D")
+      engineName = "VULKAN";
+
+   if (engineName == "vkd3d")
+      engineName = "VKD3D";
+
+   assert(chain_info->u.pLayerInfo);
+   PFN_vkGetInstanceProcAddr fpGetInstanceProcAddr =
+      chain_info->u.pLayerInfo->pfnNextGetInstanceProcAddr;
+   PFN_vkCreateInstance fpCreateInstance =
+      (PFN_vkCreateInstance)fpGetInstanceProcAddr(NULL, "vkCreateInstance");
+   if (fpCreateInstance == NULL) {
+      return VK_ERROR_INITIALIZATION_FAILED;
+   }
+
+   // Advance the link info for the next element on the chain
+   chain_info->u.pLayerInfo = chain_info->u.pLayerInfo->pNext;
+
+   VkResult result = fpCreateInstance(pCreateInfo, pAllocator, pInstance);
+   if (result != VK_SUCCESS) return result;
+
+   struct instance_data *instance_data = new_instance_data(*pInstance);
+   vk_load_instance_commands(instance_data->instance,
+                             fpGetInstanceProcAddr,
+                             &instance_data->vtable);
+   instance_data_map_physical_devices(instance_data, true);
+
+   parse_overlay_env(&instance_data->params, getenv("MANGOHUD_CONFIG"));
+
+   int font_size = instance_data->params.font_size > 0 ? instance_data->params.font_size : 24;
+
+   hudSpacing = font_size / 2;
+   hudFirstRow = font_size * 5;
+   hudSecondRow = font_size * 8;
+
+   /* If there's no control file and an output_file was specified, start
+    * capturing fps data right away.
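+    * Otherwise capture stays off until toggled at runtime (the F2
+    * keybinding described in the README).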
+    */
+   instance_data->capture_enabled =
+      instance_data->params.output_file && instance_data->params.control < 0;
+   instance_data->capture_started = instance_data->capture_enabled;
+
+   for (int i = OVERLAY_PARAM_ENABLED_vertices;
+        i <= OVERLAY_PARAM_ENABLED_compute_invocations; i++) {
+      if (instance_data->params.enabled[i]) {
+         instance_data->pipeline_statistics_enabled = true;
+         break;
+      }
+   }
+
+   return result;
+}
+
+static void overlay_DestroyInstance(
+    VkInstance                                  instance,
+    const VkAllocationCallbacks*                pAllocator)
+{
+   struct instance_data *instance_data = FIND(struct instance_data, instance);
+   instance_data_map_physical_devices(instance_data, false);
+   instance_data->vtable.DestroyInstance(instance, pAllocator);
+   destroy_instance_data(instance_data);
+}
+
+static const struct {
+   const char *name;
+   void *ptr;
+} name_to_funcptr_map[] = {
+   { "vkGetDeviceProcAddr", (void *) vkGetDeviceProcAddr },
+#define ADD_HOOK(fn) { "vk" # fn, (void *) overlay_ ## fn }
+#define ADD_ALIAS_HOOK(alias, fn) { "vk" # alias, (void *) overlay_ ## fn }
+   ADD_HOOK(AllocateCommandBuffers),
+   ADD_HOOK(FreeCommandBuffers),
+   ADD_HOOK(ResetCommandBuffer),
+   ADD_HOOK(BeginCommandBuffer),
+   ADD_HOOK(EndCommandBuffer),
+   ADD_HOOK(CmdExecuteCommands),
+
+   ADD_HOOK(CmdDraw),
+   ADD_HOOK(CmdDrawIndexed),
+   ADD_HOOK(CmdDrawIndirect),
+   ADD_HOOK(CmdDrawIndexedIndirect),
+   ADD_HOOK(CmdDispatch),
+   ADD_HOOK(CmdDispatchIndirect),
+   ADD_HOOK(CmdDrawIndirectCount),
+   ADD_ALIAS_HOOK(CmdDrawIndirectCountKHR, CmdDrawIndirectCount),
+   ADD_HOOK(CmdDrawIndexedIndirectCount),
+   ADD_ALIAS_HOOK(CmdDrawIndexedIndirectCountKHR, CmdDrawIndexedIndirectCount),
+
+   ADD_HOOK(CmdBindPipeline),
+
+   ADD_HOOK(CreateSwapchainKHR),
+   ADD_HOOK(QueuePresentKHR),
+   ADD_HOOK(DestroySwapchainKHR),
+   ADD_HOOK(AcquireNextImageKHR),
+   ADD_HOOK(AcquireNextImage2KHR),
+
+   ADD_HOOK(QueueSubmit),
+
+   ADD_HOOK(CreateDevice),
+   ADD_HOOK(DestroyDevice),
+
+   ADD_HOOK(CreateInstance),
+   ADD_HOOK(DestroyInstance),
+#undef ADD_HOOK
+#undef ADD_ALIAS_HOOK
+};
+
+static void *find_ptr(const char *name)
+{
+   for (uint32_t i = 0; i < ARRAY_SIZE(name_to_funcptr_map); i++) {
+      if (strcmp(name, name_to_funcptr_map[i].name) == 0)
+         return name_to_funcptr_map[i].ptr;
+   }
+
+   return NULL;
+}
+
+VK_LAYER_EXPORT VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetDeviceProcAddr(VkDevice dev,
+                                                                             const char *funcName)
+{
+   void *ptr = find_ptr(funcName);
+   if (ptr) return reinterpret_cast<PFN_vkVoidFunction>(ptr);
+
+   if (dev == NULL) return NULL;
+
+   struct device_data *device_data = FIND(struct device_data, dev);
+   if (device_data->vtable.GetDeviceProcAddr == NULL) return NULL;
+   return device_data->vtable.GetDeviceProcAddr(dev, funcName);
+}
+
+VK_LAYER_EXPORT VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vkGetInstanceProcAddr(VkInstance instance,
+                                                                               const char *funcName)
+{
+   void *ptr = find_ptr(funcName);
+   if (ptr) return reinterpret_cast<PFN_vkVoidFunction>(ptr);
+
+   if (instance == NULL) return NULL;
+
+   struct instance_data *instance_data = FIND(struct instance_data, instance);
+   if (instance_data->vtable.GetInstanceProcAddr == NULL) return NULL;
+   return instance_data->vtable.GetInstanceProcAddr(instance, funcName);
+}
diff --git a/src/overlay.frag b/src/overlay.frag
new file mode 100644
index 00000000..313a8880
--- /dev/null
+++ b/src/overlay.frag
@@ -0,0 +1,14 @@
+#version 450 core
+layout(location = 0) out vec4 fColor;
+
+layout(set=0, binding=0) uniform sampler2D sTexture;
+
+layout(location = 0) in struct{
+    vec4 Color;
+    vec2 UV;
+} In;
+
+void main()
+{
+    fColor = In.Color * texture(sTexture, In.UV.st);
+}
diff --git
a/src/overlay.vert b/src/overlay.vert new file mode 100644 index 00000000..20b29082 --- /dev/null +++ b/src/overlay.vert @@ -0,0 +1,25 @@ +#version 450 core +layout(location = 0) in vec2 aPos; +layout(location = 1) in vec2 aUV; +layout(location = 2) in vec4 aColor; + +layout(push_constant) uniform uPushConstant{ + vec2 uScale; + vec2 uTranslate; +} pc; + +out gl_PerVertex{ + vec4 gl_Position; +}; + +layout(location = 0) out struct{ + vec4 Color; + vec2 UV; +} Out; + +void main() +{ + Out.Color = aColor; + Out.UV = aUV; + gl_Position = vec4(aPos*pc.uScale+pc.uTranslate, 0, 1); +} diff --git a/src/overlay_params.c b/src/overlay_params.c new file mode 100644 index 00000000..95715691 --- /dev/null +++ b/src/overlay_params.c @@ -0,0 +1,223 @@ +/* + * Copyright © 2019 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/sysinfo.h>
+
+#include "overlay_params.h"
+
+#include "mesa/util/os_socket.h"
+
+static enum overlay_param_position
+parse_position(const char *str)
+{
+   if (!str || !strcmp(str, "top-left"))
+      return LAYER_POSITION_TOP_LEFT;
+   if (!strcmp(str, "top-right"))
+      return LAYER_POSITION_TOP_RIGHT;
+   if (!strcmp(str, "bottom-left"))
+      return LAYER_POSITION_BOTTOM_LEFT;
+   if (!strcmp(str, "bottom-right"))
+      return LAYER_POSITION_BOTTOM_RIGHT;
+   return LAYER_POSITION_TOP_LEFT;
+}
+
+static FILE *
+parse_output_file(const char *str)
+{
+   return fopen(str, "w+");
+}
+
+static int
+parse_control(const char *str)
+{
+   int ret = os_socket_listen_abstract(str, 1);
+   if (ret < 0) {
+      fprintf(stderr, "ERROR: Couldn't create socket pipe at '%s'\n", str);
+      fprintf(stderr, "ERROR: '%s'\n", strerror(errno));
+      return ret;
+   }
+
+   os_socket_block(ret, false);
+
+   return ret;
+}
+
+static float
+parse_font_size(const char *str)
+{
+   return strtof(str, NULL);
+}
+
+static uint32_t
+parse_fps_sampling_period(const char *str)
+{
+   return strtol(str, NULL, 0) * 1000;
+}
+
+static bool
+parse_no_display(const char *str)
+{
+   return strtol(str, NULL, 0) != 0;
+}
+
+static unsigned
+parse_unsigned(const char *str)
+{
+   return strtol(str, NULL, 0);
+}
+
+#define parse_width(s)  parse_unsigned(s)
+#define parse_height(s) parse_unsigned(s)
+
+static bool
+parse_help(const char *str)
+{
+   fprintf(stderr, "Layer params using MANGOHUD_CONFIG=\n");
+#define OVERLAY_PARAM_BOOL(name) \
+   fprintf(stderr, "\t%s=0|1\n", #name);
+#define OVERLAY_PARAM_CUSTOM(name)
+   OVERLAY_PARAMS
+#undef OVERLAY_PARAM_BOOL
+#undef OVERLAY_PARAM_CUSTOM
+   fprintf(stderr, "\tposition=top-left|top-right|bottom-left|bottom-right\n");
+   fprintf(stderr, "\tfps_sampling_period=number-of-milliseconds\n");
+   fprintf(stderr, "\tno_display=0|1\n");
+   fprintf(stderr, "\toutput_file=/path/to/output.txt\n");
+   fprintf(stderr, "\twidth=width-in-pixels\n");
+   fprintf(stderr, "\theight=height-in-pixels\n");
+
+   return true;
+}
+
+static bool is_delimiter(char c)
+{
+   return c == 0 || c == ',' || c == ':' || c == ';' || c == '=';
+}
+
+static int
+parse_string(const char *s, char *out_param, char *out_value)
+{
+   int i = 0;
+
+   for (; !is_delimiter(*s); s++, out_param++, i++)
+      *out_param = *s;
+
+   *out_param = 0;
+
+   if (*s == '=') {
+      s++;
+      i++;
+      for (; !is_delimiter(*s); s++, out_value++, i++)
+         *out_value = *s;
+   } else
+      *(out_value++) = '1';
+   *out_value = 0;
+
+   if (*s && is_delimiter(*s)) {
+      s++;
+      i++;
+   }
+
+   if (*s && !i) {
+      fprintf(stderr, "mesa-overlay: syntax error: unexpected '%c' (%i) while "
+              "parsing a string\n", *s, *s);
+      fflush(stderr);
+   }
+
+   return i;
+}
+
+const char *overlay_param_names[] = {
+#define OVERLAY_PARAM_BOOL(name) #name,
+#define OVERLAY_PARAM_CUSTOM(name)
+   OVERLAY_PARAMS
+#undef OVERLAY_PARAM_BOOL
+#undef OVERLAY_PARAM_CUSTOM
+};
+
+void
+parse_overlay_env(struct overlay_params *params,
+                  const char *env)
+{
+   uint32_t num;
+   char key[256], value[256];
+
+   memset(params, 0, sizeof(*params));
+
+   /* Visible by default */
+   params->enabled[OVERLAY_PARAM_ENABLED_fps] = true;
+   params->enabled[OVERLAY_PARAM_ENABLED_frame_timing] = true;
+   params->enabled[OVERLAY_PARAM_ENABLED_core_load] = false;
+   params->enabled[OVERLAY_PARAM_ENABLED_cpu_temp] = false;
+   params->enabled[OVERLAY_PARAM_ENABLED_gpu_temp] = false;
+   params->fps_sampling_period = 500000; /* 500ms */
+   params->width = 280;
+   params->height = 140;
+   params->control = -1;
+
+   if (!env)
+      return;
+
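+   /* Consume "key" or "key=value" tokens one at a time, e.g.
+    * MANGOHUD_CONFIG=cpu_temp,gpu_temp,position=top-right
+    * parse_string() treats a bare key as key=1.
+    */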
+   while ((num = parse_string(env, key, value)) != 0) {
+      env += num;
+
+#define OVERLAY_PARAM_BOOL(name)                                \
+      if (!strcmp(#name, key)) {                                \
+         params->enabled[OVERLAY_PARAM_ENABLED_##name] =        \
+            strtol(value, NULL, 0);                             \
+         continue;                                              \
+      }
+#define OVERLAY_PARAM_CUSTOM(name)              \
+      if (!strcmp(#name, key)) {                \
+         params->name = parse_##name(value);    \
+         continue;                              \
+      }
+      OVERLAY_PARAMS
+#undef OVERLAY_PARAM_BOOL
+#undef OVERLAY_PARAM_CUSTOM
+      fprintf(stderr, "Unknown option '%s'\n", key);
+   }
+
+   /* If font_size is set and height has not been changed from its default,
+    * grow the hud height as needed to fit the font.
+    */
+   bool heightChanged = params->height != 140;
+
+   int FrameTimeGraphHeight = 50;
+
+   if (!params->font_size)
+      params->font_size = 24.0f;
+
+   if (!heightChanged)
+      params->height = (params->font_size + 3 * 2) * 3 + FrameTimeGraphHeight;
+
+   // Add more hud height if per-core load reporting is enabled
+   if (params->enabled[OVERLAY_PARAM_ENABLED_core_load] && !heightChanged)
+      params->height += ((params->font_size - 3) * get_nprocs());
+}
diff --git a/src/overlay_params.h b/src/overlay_params.h
new file mode 100644
index 00000000..12a152c8
--- /dev/null
+++ b/src/overlay_params.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright © 2019 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef OVERLAY_PARAMS_H
+#define OVERLAY_PARAMS_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#define OVERLAY_PARAMS                              \
+   OVERLAY_PARAM_BOOL(fps)                          \
+   OVERLAY_PARAM_BOOL(frame)                        \
+   OVERLAY_PARAM_BOOL(frame_timing)                 \
+   OVERLAY_PARAM_BOOL(submit)                       \
+   OVERLAY_PARAM_BOOL(draw)                         \
+   OVERLAY_PARAM_BOOL(draw_indexed)                 \
+   OVERLAY_PARAM_BOOL(draw_indirect)                \
+   OVERLAY_PARAM_BOOL(draw_indexed_indirect)        \
+   OVERLAY_PARAM_BOOL(draw_indirect_count)          \
+   OVERLAY_PARAM_BOOL(draw_indexed_indirect_count)  \
+   OVERLAY_PARAM_BOOL(dispatch)                     \
+   OVERLAY_PARAM_BOOL(dispatch_indirect)            \
+   OVERLAY_PARAM_BOOL(pipeline_graphics)            \
+   OVERLAY_PARAM_BOOL(pipeline_compute)             \
+   OVERLAY_PARAM_BOOL(pipeline_raytracing)          \
+   OVERLAY_PARAM_BOOL(acquire)                      \
+   OVERLAY_PARAM_BOOL(acquire_timing)               \
+   OVERLAY_PARAM_BOOL(present_timing)               \
+   OVERLAY_PARAM_BOOL(vertices)                     \
+   OVERLAY_PARAM_BOOL(primitives)                   \
+   OVERLAY_PARAM_BOOL(vert_invocations)             \
+   OVERLAY_PARAM_BOOL(geom_invocations)             \
+   OVERLAY_PARAM_BOOL(geom_primitives)              \
+   OVERLAY_PARAM_BOOL(clip_invocations)             \
+   OVERLAY_PARAM_BOOL(clip_primitives)              \
+   OVERLAY_PARAM_BOOL(frag_invocations)             \
+   OVERLAY_PARAM_BOOL(tess_ctrl_patches)            \
+   OVERLAY_PARAM_BOOL(tess_eval_invocations)        \
+   OVERLAY_PARAM_BOOL(compute_invocations)          \
+   OVERLAY_PARAM_BOOL(gpu_timing)                   \
+   OVERLAY_PARAM_BOOL(core_load)                    \
+   OVERLAY_PARAM_BOOL(cpu_temp)                     \
+   OVERLAY_PARAM_BOOL(gpu_temp)                     \
+   OVERLAY_PARAM_CUSTOM(fps_sampling_period)        \
+   OVERLAY_PARAM_CUSTOM(output_file)                \
+   OVERLAY_PARAM_CUSTOM(position)                   \
+   OVERLAY_PARAM_CUSTOM(width)                      \
+   OVERLAY_PARAM_CUSTOM(height)                     \
+   OVERLAY_PARAM_CUSTOM(no_display)                 \
+   OVERLAY_PARAM_CUSTOM(control)                    \
+   OVERLAY_PARAM_CUSTOM(font_size)                  \
+   OVERLAY_PARAM_CUSTOM(help)
+
+enum overlay_param_position {
+   LAYER_POSITION_TOP_LEFT,
+   LAYER_POSITION_TOP_RIGHT,
+   LAYER_POSITION_BOTTOM_LEFT,
+   LAYER_POSITION_BOTTOM_RIGHT,
+};
+
+enum overlay_param_enabled {
+#define OVERLAY_PARAM_BOOL(name) OVERLAY_PARAM_ENABLED_##name,
+#define OVERLAY_PARAM_CUSTOM(name)
+   OVERLAY_PARAMS
+#undef OVERLAY_PARAM_BOOL
+#undef OVERLAY_PARAM_CUSTOM
+   OVERLAY_PARAM_ENABLED_MAX
+};
+
+struct overlay_params {
+   bool enabled[OVERLAY_PARAM_ENABLED_MAX];
+   enum overlay_param_position position;
+   FILE *output_file;
+   int control;
+   uint32_t fps_sampling_period; /* us */
+   bool help;
+   bool no_display;
+   unsigned width;
+   unsigned height;
+   float font_size;
+};
+
+extern const char *overlay_param_names[];
+
+void parse_overlay_env(struct overlay_params *params,
+                       const char *env);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* OVERLAY_PARAMS_H */
diff --git a/src/setup_mangohud.sh b/src/setup_mangohud.sh
new file mode 100755
index 00000000..9faaa17f
--- /dev/null
+++ b/src/setup_mangohud.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+MANGOHUD_DIR=$HOME/.local/share/MangoHud/
+LIB64=$HOME/.local/share/MangoHud/libMangoHud64.so
+LIB32=$HOME/.local/share/MangoHud/libMangoHud32.so
+IMPLICIT_LAYER_DIR=$HOME/.local/share/vulkan/implicit_layer.d
+EXPLICIT_LAYER_DIR=$HOME/.local/share/vulkan/explicit_layer.d
+
+install() {
+    mkdir -p "$IMPLICIT_LAYER_DIR"
+    mkdir -p "$EXPLICIT_LAYER_DIR"
+    mkdir -p "$MANGOHUD_DIR"
+    cp -v x32/libMangoHud32.so "$MANGOHUD_DIR"
+    cp -v x64/libMangoHud64.so "$MANGOHUD_DIR"
+    cp -v implicit_layer.d/mangohud32.json "$IMPLICIT_LAYER_DIR"
+    cp -v implicit_layer.d/mangohud64.json "$IMPLICIT_LAYER_DIR"
+    cp -v explicit_layer.d/mangohud32.json "$EXPLICIT_LAYER_DIR"
+    cp -v explicit_layer.d/mangohud64.json "$EXPLICIT_LAYER_DIR"
+    sed -i "s|libMangoHud.so|$LIB32|g" "$IMPLICIT_LAYER_DIR/mangohud32.json"
+    sed -i "s|libMangoHud.so|$LIB64|g" "$IMPLICIT_LAYER_DIR/mangohud64.json"
+    sed -i "s|64bit|32bit|g" "$IMPLICIT_LAYER_DIR/mangohud32.json"
+    sed -i "s|libMangoHud.so|$LIB32|g" "$EXPLICIT_LAYER_DIR/mangohud32.json"
+    sed -i "s|libMangoHud.so|$LIB64|g" "$EXPLICIT_LAYER_DIR/mangohud64.json"
+    sed -i "s|64bit|32bit|g" "$EXPLICIT_LAYER_DIR/mangohud32.json"
+    sed -i "s|mangohud|mangohud32|g" "$EXPLICIT_LAYER_DIR/mangohud32.json"
+}
+
+uninstall() {
+    rm -v "$MANGOHUD_DIR/libMangoHud32.so"
+    rm -v "$MANGOHUD_DIR/libMangoHud64.so"
+    rm -v "$IMPLICIT_LAYER_DIR/mangohud32.json"
+    rm -v "$IMPLICIT_LAYER_DIR/mangohud64.json"
+    rm -v "$EXPLICIT_LAYER_DIR/mangohud32.json"
+    rm -v "$EXPLICIT_LAYER_DIR/mangohud64.json"
+}
+
+case "$1" in
+    "install")
+        install
+        ;;
+    "uninstall")
+        uninstall
+        ;;
+    *)
+        echo "Unrecognized action: $1"
+        echo "Usage: $0 [install|uninstall]"
+        exit 1
+        ;;
+esac
\ No newline at end of file