PKGBUILDs/community/openimageio/arm.patch

diff -urN a/src/include/thread.h b/src/include/thread.h
--- a/src/include/thread.h 2013-07-08 13:47:25.000000000 -0600
+++ b/src/include/thread.h 2013-07-13 14:30:25.709073357 -0600
@@ -323,8 +323,12 @@
#if defined __arm__ || defined __s390__
__asm__ __volatile__("NOP;");
#else
+#if defined __arm__
+ __asm__ __volatile__("NOP;");
+#else
__asm__ __volatile__("pause;");
#endif
+#endif
}
#elif USE_TBB
__TBB_Pause(delay);
@@ -447,7 +451,12 @@
// Disallow copy construction by making private and unimplemented.
atomic (atomic const &);
+
+#if defined __arm__
+} __attribute__((aligned(8)));
+#else
};
+#endif
#endif /* ! USE_TBB_ATOMIC */
diff -urN a/src/include/thread.h.orig b/src/include/thread.h.orig
--- a/src/include/thread.h.orig 1969-12-31 17:00:00.000000000 -0700
+++ b/src/include/thread.h.orig 2013-07-08 13:47:25.000000000 -0600
@@ -0,0 +1,708 @@
+/*
+ Copyright 2008 Larry Gritz and the other authors and contributors.
+ All Rights Reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the software's owners nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ (This is the Modified BSD License)
+*/
+
+
+/////////////////////////////////////////////////////////////////////////
+/// @file thread.h
+///
+/// @brief Wrappers and utilities for multithreading.
+/////////////////////////////////////////////////////////////////////////
+
+
+#ifndef OPENIMAGEIO_THREAD_H
+#define OPENIMAGEIO_THREAD_H
+
+#include "version.h"
+#include "sysutil.h"
+
+
+// defining NOMINMAX to prevent problems with std::min/std::max
+// and std::numeric_limits<type>::min()/std::numeric_limits<type>::max()
+// when boost include windows.h
+#ifdef _MSC_VER
+# define WIN32_LEAN_AND_MEAN
+# define VC_EXTRALEAN
+# ifndef NOMINMAX
+# define NOMINMAX
+# endif
+#endif
+
+#include <boost/version.hpp>
+#if defined(__GNUC__) && (BOOST_VERSION == 104500)
+// gcc reports errors inside some of the boost headers with boost 1.45
+// See: https://svn.boost.org/trac/boost/ticket/4818
+#pragma GCC diagnostic ignored "-Wunused-variable"
+#endif
+
+#include <boost/thread.hpp>
+#include <boost/thread/tss.hpp>
+#include <boost/version.hpp>
+
+#if defined(__GNUC__) && (BOOST_VERSION == 104500)
+// can't restore via push/pop in all versions of gcc (warning push/pop implemented for 4.6+ only)
+#pragma GCC diagnostic error "-Wunused-variable"
+#endif
+
+#ifndef USE_TBB
+# define USE_TBB 0
+#endif
+
+// Include files we need for atomic counters.
+// Some day, we hope this is all replaced by use of std::atomic<>.
+#if USE_TBB
+# include <tbb/atomic.h>
+# include <tbb/spin_mutex.h>
+# define USE_TBB_ATOMIC 1
+# define USE_TBB_SPINLOCK 1
+#else
+# define USE_TBB_ATOMIC 0
+# define USE_TBB_SPINLOCK 0
+#endif
+
+
+#if defined(_MSC_VER) && !USE_TBB
+# include <windows.h>
+# include <winbase.h>
+# pragma intrinsic (_InterlockedExchangeAdd)
+# pragma intrinsic (_InterlockedCompareExchange)
+# pragma intrinsic (_InterlockedCompareExchange64)
+# pragma intrinsic (_ReadWriteBarrier)
+# if defined(_WIN64)
+# pragma intrinsic(_InterlockedExchangeAdd64)
+# endif
+// InterlockedExchangeAdd64 is not available for XP
+# if defined(_WIN32_WINNT) && _WIN32_WINNT <= 0x0501
+inline long long
+InterlockedExchangeAdd64 (volatile long long *Addend, long long Value)
+{
+ long long Old;
+ do {
+ Old = *Addend;
+ } while (_InterlockedCompareExchange64(Addend, Old + Value, Old) != Old);
+ return Old;
+}
+# endif
+#endif
+
+#if defined(__GNUC__) && (defined(_GLIBCXX_ATOMIC_BUILTINS) || (__GNUC__ * 100 + __GNUC_MINOR__ >= 401))
+#if !defined(__FreeBSD__) || defined(__x86_64__)
+#define USE_GCC_ATOMICS
+#endif
+#endif
+
+OIIO_NAMESPACE_ENTER
+{
+
+/// Null mutex that can be substituted for a real one to test how much
+/// overhead is associated with a particular mutex.
+class null_mutex {
+public:
+ null_mutex () { }
+ ~null_mutex () { }
+ void lock () { }
+ void unlock () { }
+ void lock_shared () { }
+ void unlock_shared () { }
+};
+
+/// Null lock that can be substituted for a real one to test how much
+/// overhead is associated with a particular lock.
+template<typename T>
+class null_lock {
+public:
+ null_lock (T &m) { }
+};
+
+
+// Null thread-specific ptr that just wraps a single ordinary pointer
+//
+template<typename T>
+class null_thread_specific_ptr {
+public:
+ typedef void (*destructor_t)(T *);
+ null_thread_specific_ptr (destructor_t dest=NULL)
+ : m_ptr(NULL), m_dest(dest) { }
+ ~null_thread_specific_ptr () { reset (NULL); }
+ T * get () { return m_ptr; }
+ void reset (T *newptr=NULL) {
+ if (m_ptr) {
+ if (m_dest)
+ (*m_dest) (m_ptr);
+ else
+ delete m_ptr;
+ }
+ m_ptr = newptr;
+ }
+private:
+ T *m_ptr;
+ destructor_t m_dest;
+};
+
+
+#ifdef NOTHREADS
+
+// Definitions that we use for debugging to turn off all mutexes, locks,
+// and atomics in order to test the performance hit of our thread safety.
+
+// Null thread-specific ptr that just wraps a single ordinary pointer
+//
+template<typename T>
+class thread_specific_ptr {
+public:
+ typedef void (*destructor_t)(T *);
+ thread_specific_ptr (destructor_t dest=NULL)
+ : m_ptr(NULL), m_dest(dest) { }
+ ~thread_specific_ptr () { reset (NULL); }
+ T * get () { return m_ptr; }
+ void reset (T *newptr=NULL) {
+ if (m_ptr) {
+ if (m_dest)
+ (*m_dest) (m_ptr);
+ else
+ delete m_ptr;
+ }
+ m_ptr = newptr;
+ }
+private:
+ T *m_ptr;
+ destructor_t m_dest;
+};
+
+
+typedef null_mutex mutex;
+typedef null_mutex recursive_mutex;
+typedef null_lock<mutex> lock_guard;
+typedef null_lock<recursive_mutex> recursive_lock_guard;
+
+
+#else
+
+// Fairly modern Boost has all the mutex and lock types we need.
+
+typedef boost::mutex mutex;
+typedef boost::recursive_mutex recursive_mutex;
+typedef boost::lock_guard< boost::mutex > lock_guard;
+typedef boost::lock_guard< boost::recursive_mutex > recursive_lock_guard;
+using boost::thread_specific_ptr;
+
+#endif
+
+
+
+/// Atomic version of: r = *at, *at += x, return r
+/// For each of several architectures.
+inline int
+atomic_exchange_and_add (volatile int *at, int x)
+{
+#ifdef USE_GCC_ATOMICS
+ return __sync_fetch_and_add ((int *)at, x);
+#elif USE_TBB
+ atomic<int> *a = (atomic<int> *)at;
+ return a->fetch_and_add (x);
+#elif defined(_MSC_VER)
+ // Windows
+ return _InterlockedExchangeAdd ((volatile LONG *)at, x);
+#else
+# error No atomics on this platform.
+#endif
+}
+
+
+
+inline long long
+atomic_exchange_and_add (volatile long long *at, long long x)
+{
+#ifdef USE_GCC_ATOMICS
+ return __sync_fetch_and_add (at, x);
+#elif USE_TBB
+ atomic<long long> *a = (atomic<long long> *)at;
+ return a->fetch_and_add (x);
+#elif defined(_MSC_VER)
+ // Windows
+# if defined(_WIN64)
+ return _InterlockedExchangeAdd64 ((volatile LONGLONG *)at, x);
+# else
+ return InterlockedExchangeAdd64 ((volatile LONGLONG *)at, x);
+# endif
+#else
+# error No atomics on this platform.
+#endif
+}
+
+
+
+/// Atomic version of:
+/// if (*at == compareval) {
+/// *at = newval; return true;
+/// } else {
+/// return false;
+///
+inline bool
+atomic_compare_and_exchange (volatile int *at, int compareval, int newval)
+{
+#ifdef USE_GCC_ATOMICS
+ return __sync_bool_compare_and_swap (at, compareval, newval);
+#elif USE_TBB
+ atomic<int> *a = (atomic<int> *)at;
+ return a->compare_and_swap (newval, compareval) == newval;
+#elif defined(_MSC_VER)
+ return (_InterlockedCompareExchange ((volatile LONG *)at, newval, compareval) == compareval);
+#else
+# error No atomics on this platform.
+#endif
+}
+
+
+
+inline bool
+atomic_compare_and_exchange (volatile long long *at, long long compareval, long long newval)
+{
+#ifdef USE_GCC_ATOMICS
+ return __sync_bool_compare_and_swap (at, compareval, newval);
+#elif USE_TBB
+ atomic<long long> *a = (atomic<long long> *)at;
+ return a->compare_and_swap (newval, compareval) == newval;
+#elif defined(_MSC_VER)
+ return (_InterlockedCompareExchange64 ((volatile LONGLONG *)at, newval, compareval) == compareval);
+#else
+# error No atomics on this platform.
+#endif
+}
+
+
+
+/// Yield the processor for the rest of the timeslice.
+///
+inline void
+yield ()
+{
+#if defined(__GNUC__)
+ sched_yield ();
+#elif defined(_MSC_VER)
+ SwitchToThread ();
+#else
+# error No yield on this platform.
+#endif
+}
+
+
+
+// Slight pause
+inline void
+pause (int delay)
+{
+#if defined(__GNUC__)
+ for (int i = 0; i < delay; ++i) {
+#if defined __arm__ || defined __s390__
+ __asm__ __volatile__("NOP;");
+#else
+ __asm__ __volatile__("pause;");
+#endif
+ }
+#elif USE_TBB
+ __TBB_Pause(delay);
+#elif defined(_MSC_VER)
+ for (int i = 0; i < delay; ++i) {
+#if defined (_WIN64)
+ YieldProcessor();
+#else
+ _asm pause
+#endif /* _WIN64 */
+ }
+#else
+ // No pause on this platform, just punt
+ for (int i = 0; i < delay; ++i) ;
+#endif
+}
+
+
+
+// Helper class to deliver ever longer pauses until we yield our timeslice.
+class atomic_backoff {
+public:
+ atomic_backoff () : m_count(1) { }
+
+ void operator() () {
+ if (m_count <= 16) {
+ pause (m_count);
+ m_count *= 2;
+ } else {
+ yield();
+ }
+ }
+
+private:
+ int m_count;
+};
+
+
+
+#if USE_TBB_ATOMIC
+using tbb::atomic;
+#else
+// If we're not using TBB's atomic, we need to define our own atomic<>.
+
+
+/// Atomic integer. Increment, decrement, add, and subtract in a
+/// totally thread-safe manner.
+template<class T>
+class atomic {
+public:
+ /// Construct with initial value.
+ ///
+ atomic (T val=0) : m_val(val) { }
+
+ ~atomic () { }
+
+ /// Retrieve value
+ ///
+ T operator() () const { return atomic_exchange_and_add (&m_val, 0); }
+
+ /// Retrieve value
+ ///
+ operator T() const { return atomic_exchange_and_add (&m_val, 0); }
+
+ /// Fast retrieval of value, no interchange, don't care about memory
+ /// fences.
+ T fast_value () const { return m_val; }
+
+ /// Assign new value.
+ ///
+ T operator= (T x) {
+ //incorrect? return (m_val = x);
+ while (1) {
+ T result = m_val;
+ if (atomic_compare_and_exchange (&m_val, result, x))
+ break;
+ }
+ return x;
+ }
+
+ /// Pre-increment: ++foo
+ ///
+ T operator++ () { return atomic_exchange_and_add (&m_val, 1) + 1; }
+
+ /// Post-increment: foo++
+ ///
+ T operator++ (int) { return atomic_exchange_and_add (&m_val, 1); }
+
+ /// Pre-decrement: --foo
+ ///
+ T operator-- () { return atomic_exchange_and_add (&m_val, -1) - 1; }
+
+ /// Post-decrement: foo--
+ ///
+ T operator-- (int) { return atomic_exchange_and_add (&m_val, -1); }
+
+ /// Add to the value, return the new result
+ ///
+ T operator+= (T x) { return atomic_exchange_and_add (&m_val, x) + x; }
+
+ /// Subtract from the value, return the new result
+ ///
+ T operator-= (T x) { return atomic_exchange_and_add (&m_val, -x) - x; }
+
+ bool bool_compare_and_swap (T compareval, T newval) {
+ return atomic_compare_and_exchange (&m_val, compareval, newval);
+ }
+
+ T operator= (const atomic &x) {
+ T r = x();
+ *this = r;
+ return r;
+ }
+
+private:
+#ifdef __arm__
+ OIIO_ALIGN(8)
+#endif
+ volatile mutable T m_val;
+
+ // Disallow copy construction by making private and unimplemented.
+ atomic (atomic const &);
+};
+
+
+#endif /* ! USE_TBB_ATOMIC */
+
+
+#ifdef NOTHREADS
+
+typedef int atomic_int;
+typedef long long atomic_ll;
+
+#else
+
+typedef atomic<int> atomic_int;
+typedef atomic<long long> atomic_ll;
+
+#endif
+
+
+
+#ifdef NOTHREADS
+
+typedef null_mutex spin_mutex;
+typedef null_lock<spin_mutex> spin_lock;
+
+#elif USE_TBB_SPINLOCK
+
+// Use TBB's spin locks
+typedef tbb::spin_mutex spin_mutex;
+typedef tbb::spin_mutex::scoped_lock spin_lock;
+
+
+#else
+
+// Define our own spin locks. Do we trust them?
+
+
+/// A spin_mutex is semantically equivalent to a regular mutex, except
+/// for the following:
+/// - A spin_mutex is just 4 bytes, whereas a regular mutex is quite
+/// large (44 bytes for pthread).
+/// - A spin_mutex is extremely fast to lock and unlock, whereas a regular
+/// mutex is surprisingly expensive just to acquire a lock.
+/// - A spin_mutex takes CPU while it waits, so this can be very
+/// wasteful compared to a regular mutex that blocks (gives up its
+/// CPU slices until it acquires the lock).
+///
+/// The bottom line is that mutex is the usual choice, but in cases where
+/// you need to acquire locks very frequently, but only need to hold the
+/// lock for a very short period of time, you may save runtime by using
+/// a spin_mutex, even though it's non-blocking.
+///
+/// N.B. A spin_mutex is only the size of an int. To avoid "false
+/// sharing", be careful not to put two spin_mutex objects on the same
+/// cache line (within 128 bytes of each other), or the two mutexes may
+/// effectively (and wastefully) lock against each other.
+///
+class spin_mutex {
+public:
+ /// Default constructor -- initialize to unlocked.
+ ///
+ spin_mutex (void) { m_locked = 0; }
+
+ ~spin_mutex (void) { }
+
+ /// Copy constructor -- initialize to unlocked.
+ ///
+ spin_mutex (const spin_mutex &) { m_locked = 0; }
+
+ /// Assignment does not do anything, since lockedness should not
+ /// transfer.
+ const spin_mutex& operator= (const spin_mutex&) { return *this; }
+
+ /// Acquire the lock, spin until we have it.
+ ///
+ void lock () {
+ // To avoid spinning too tightly, we use the atomic_backoff to
+ // provide increasingly longer pauses, and if the lock is under
+ // lots of contention, eventually yield the timeslice.
+ atomic_backoff backoff;
+
+ // Try to get ownership of the lock. Though experimentation, we
+ // found that OIIO_UNLIKELY makes this just a bit faster on
+ // gcc x86/x86_64 systems.
+ while (! OIIO_UNLIKELY(try_lock())) {
+ do {
+ backoff();
+ } while (m_locked);
+
+ // The full try_lock() involves a compare_and_swap, which
+ // writes memory, and that will lock the bus. But a normal
+ // read of m_locked will let us spin until the value
+ // changes, without locking the bus. So it's faster to
+ // check in this manner until the mutex appears to be free.
+ }
+ }
+
+ /// Release the lock that we hold.
+ ///
+ void unlock () {
+#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
+ // Fastest way to do it is with a store with "release" semantics
+ __asm__ __volatile__("": : :"memory");
+ m_locked = 0;
+ // N.B. GCC gives us an intrinsic that is even better, an atomic
+ // assignment of 0 with "release" barrier semantics:
+ // __sync_lock_release (&m_locked);
+ // But empirically we found it not as performant as the above.
+#elif defined(_MSC_VER)
+ _ReadWriteBarrier();
+ m_locked = 0;
+#else
+ // Otherwise, just assign zero to the atomic (but that's a full
+ // memory barrier).
+ *(atomic_int *)&m_locked = 0;
+#endif
+ }
+
+ /// Try to acquire the lock. Return true if we have it, false if
+ /// somebody else is holding the lock.
+ bool try_lock () {
+#if USE_TBB_ATOMIC
+ // TBB's compare_and_swap returns the original value
+ return (*(atomic_int *)&m_locked).compare_and_swap (0, 1) == 0;
+#elif defined(__GNUC__)
+ // GCC gives us an intrinsic that is even better -- an atomic
+ // exchange with "acquire" barrier semantics.
+ return __sync_lock_test_and_set (&m_locked, 1) == 0;
+#else
+ // Our compare_and_swap returns true if it swapped
+ return atomic_compare_and_exchange (&m_locked, 0, 1);
+#endif
+ }
+
+ /// Helper class: scoped lock for a spin_mutex -- grabs the lock upon
+ /// construction, releases the lock when it exits scope.
+ class lock_guard {
+ public:
+ lock_guard (spin_mutex &fm) : m_fm(fm) { m_fm.lock(); }
+ ~lock_guard () { m_fm.unlock(); }
+ private:
+ lock_guard(); // Do not implement (even though TBB does)
+ lock_guard(const lock_guard& other); // Do not implement
+ lock_guard& operator = (const lock_guard& other); // Do not implement
+ spin_mutex & m_fm;
+ };
+
+private:
+ volatile int m_locked; ///< Atomic counter is zero if nobody holds the lock
+};
+
+
+typedef spin_mutex::lock_guard spin_lock;
+
+#endif
+
+
+
+/// Spinning reader/writer mutex. This is just like spin_mutex, except
+/// that there are separate locking mechanisms for "writers" (exclusive
+/// holders of the lock, presumably because they are modifying whatever
+/// the lock is protecting) and "readers" (non-exclusive, non-modifying
+/// tasks that may access the protectee simultaneously).
+class spin_rw_mutex {
+public:
+ /// Default constructor -- initialize to unlocked.
+ ///
+ spin_rw_mutex (void) { m_readers = 0; }
+
+ ~spin_rw_mutex (void) { }
+
+ /// Copy constructor -- initialize to unlocked.
+ ///
+ spin_rw_mutex (const spin_rw_mutex &) { m_readers = 0; }
+
+ /// Assignment does not do anything, since lockedness should not
+ /// transfer.
+ const spin_rw_mutex& operator= (const spin_rw_mutex&) { return *this; }
+
+ /// Acquire the reader lock.
+ ///
+ void read_lock () {
+ // Spin until there are no writers active
+ m_locked.lock();
+ // Register ourself as a reader
+ ++m_readers;
+ // Release the lock, to let other readers work
+ m_locked.unlock();
+ }
+
+ /// Release the reader lock.
+ ///
+ void read_unlock () {
+ --m_readers; // it's atomic, no need to lock to release
+ }
+
+ /// Acquire the writer lock.
+ ///
+ void write_lock () {
+ // Make sure no new readers (or writers) can start
+ m_locked.lock();
+ // Spin until the last reader is done, at which point we will be
+ // the sole owners and nobody else (reader or writer) can acquire
+ // the resource until we release it.
+ while (*(volatile int *)&m_readers > 0)
+ ;
+ }
+
+ /// Release the writer lock.
+ ///
+ void write_unlock () {
+ // Let other readers or writers get the lock
+ m_locked.unlock ();
+ }
+
+ /// Helper class: scoped read lock for a spin_rw_mutex -- grabs the
+ /// read lock upon construction, releases the lock when it exits scope.
+ class read_lock_guard {
+ public:
+ read_lock_guard (spin_rw_mutex &fm) : m_fm(fm) { m_fm.read_lock(); }
+ ~read_lock_guard () { m_fm.read_unlock(); }
+ private:
+ read_lock_guard(); // Do not implement
+ read_lock_guard(const read_lock_guard& other); // Do not implement
+ read_lock_guard& operator = (const read_lock_guard& other); // Do not implement
+ spin_rw_mutex & m_fm;
+ };
+
+ /// Helper class: scoped write lock for a spin_rw_mutex -- grabs the
+ /// read lock upon construction, releases the lock when it exits scope.
+ class write_lock_guard {
+ public:
+ write_lock_guard (spin_rw_mutex &fm) : m_fm(fm) { m_fm.write_lock(); }
+ ~write_lock_guard () { m_fm.write_unlock(); }
+ private:
+ write_lock_guard(); // Do not implement
+ write_lock_guard(const write_lock_guard& other); // Do not implement
+ write_lock_guard& operator = (const write_lock_guard& other); // Do not implement
+ spin_rw_mutex & m_fm;
+ };
+
+private:
+ OIIO_CACHE_ALIGN
+ spin_mutex m_locked; // write lock
+ char pad1_[OIIO_CACHE_LINE_SIZE-sizeof(spin_mutex)];
+ OIIO_CACHE_ALIGN
+ atomic_int m_readers; // number of readers
+ char pad2_[OIIO_CACHE_LINE_SIZE-sizeof(atomic_int)];
+};
+
+
+typedef spin_rw_mutex::read_lock_guard spin_rw_read_lock;
+typedef spin_rw_mutex::write_lock_guard spin_rw_write_lock;
+
+
+}
+OIIO_NAMESPACE_EXIT
+
+#endif // OPENIMAGEIO_THREAD_H