/*
 * ompt-tsan.cpp -- Archer runtime library, TSan annotations for Archer
 */
  
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for details.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//


#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS
#endif

#include <atomic>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <inttypes.h>
#include <iostream>
#include <list>
#include <mutex>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
#include <vector>

#if (defined __APPLE__ && defined __MACH__)
#include <dlfcn.h>
#endif

#include <sys/resource.h>
#include "omp-tools.h"

// Set to 1 when the tool detects that it is running on top of TSan.
static int runOnTsan;
// Registration result for the optional reduction callback; values below
// ompt_set_always mean reductions are not reliably reported.
static int hasReductionCallback;

class ArcherFlags {
public:
#if (LLVM_VERSION) >= 40
  int flush_shadow;
#endif
  int print_max_rss;
  int verbose;
  int enabled;

  ArcherFlags(const char *env)
      :
#if (LLVM_VERSION) >= 40
        flush_shadow(0),
#endif
        print_max_rss(0), verbose(0), enabled(1) {
    if (env) {
      std::vector<std::string> tokens;
      std::string token;
      std::string str(env);
      std::istringstream iss(str);
      while (std::getline(iss, token, ' '))
        tokens.push_back(token);

      for (std::vector<std::string>::iterator it = tokens.begin();
           it != tokens.end(); ++it) {
#if (LLVM_VERSION) >= 40
        if (sscanf(it->c_str(), "flush_shadow=%d", &flush_shadow))
          continue;
#endif
        if (sscanf(it->c_str(), "print_max_rss=%d", &print_max_rss))
          continue;
        if (sscanf(it->c_str(), "verbose=%d", &verbose))
          continue;
        if (sscanf(it->c_str(), "enable=%d", &enabled))
          continue;
        std::cerr << "Illegal values for ARCHER_OPTIONS variable: " << token
                  << std::endl;
      }
    }
  }
};

class TsanFlags {
public:
  int ignore_noninstrumented_modules;

  TsanFlags(const char *env) : ignore_noninstrumented_modules(0) {
    if (env) {
      std::vector<std::string> tokens;
      std::string token;
      std::string str(env);
      std::istringstream iss(str);
      while (std::getline(iss, token, ' '))
        tokens.push_back(token);

      for (std::vector<std::string>::iterator it = tokens.begin();
           it != tokens.end(); ++it) {
        // We are only interested in ignore_noninstrumented_modules, in order
        // to print a warning if it is unset.
        if (sscanf(it->c_str(), "ignore_noninstrumented_modules=%d",
                   &ignore_noninstrumented_modules))
          continue;
      }
    }
  }
};

#if (LLVM_VERSION) >= 40
extern "C" {
int __attribute__((weak)) __archer_get_omp_status();
void __attribute__((weak)) __tsan_flush_memory() {}
}
#endif
ArcherFlags *archer_flags;

// The following definitions are pasted from "llvm/Support/Compiler.h" to allow
// the code to be compiled with other compilers like gcc:

#ifndef TsanHappensBefore
// Thread Sanitizer is a tool that finds races in code.
// See http://code.google.com/p/data-race-test/wiki/DynamicAnnotations .
// TSan detects these exact functions by name.
extern "C" {
#if (defined __APPLE__ && defined __MACH__)
static void AnnotateHappensAfter(const char *file, int line,
                                 const volatile void *cv) {
  void (*fptr)(const char *, int, const volatile void *);

  fptr = (void (*)(const char *, int, const volatile void *))dlsym(
      RTLD_DEFAULT, "AnnotateHappensAfter");
  (*fptr)(file, line, cv);
}
static void AnnotateHappensBefore(const char *file, int line,
                                  const volatile void *cv) {
  void (*fptr)(const char *, int, const volatile void *);

  fptr = (void (*)(const char *, int, const volatile void *))dlsym(
      RTLD_DEFAULT, "AnnotateHappensBefore");
  (*fptr)(file, line, cv);
}
static void AnnotateIgnoreWritesBegin(const char *file, int line) {
  void (*fptr)(const char *, int);

  fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT,
                                            "AnnotateIgnoreWritesBegin");
  (*fptr)(file, line);
}
static void AnnotateIgnoreWritesEnd(const char *file, int line) {
  void (*fptr)(const char *, int);

  fptr = (void (*)(const char *, int))dlsym(RTLD_DEFAULT,
                                            "AnnotateIgnoreWritesEnd");
  (*fptr)(file, line);
}
static void AnnotateNewMemory(const char *file, int line,
                              const volatile void *cv, size_t size) {
  void (*fptr)(const char *, int, const volatile void *, size_t);

  fptr = (void (*)(const char *, int, const volatile void *, size_t))dlsym(
      RTLD_DEFAULT, "AnnotateNewMemory");
  (*fptr)(file, line, cv, size);
}
static int RunningOnValgrind() {
  int (*fptr)();

  fptr = (int (*)())dlsym(RTLD_DEFAULT, "RunningOnValgrind");
  if (fptr && fptr != RunningOnValgrind)
    runOnTsan = 0;
  return 0;
}
#else
void __attribute__((weak))
AnnotateHappensAfter(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateHappensBefore(const char *file, int line, const volatile void *cv) {}
void __attribute__((weak))
AnnotateIgnoreWritesBegin(const char *file, int line) {}
void __attribute__((weak)) AnnotateIgnoreWritesEnd(const char *file, int line) {
}
void __attribute__((weak))
AnnotateNewMemory(const char *file, int line, const volatile void *cv,
                  size_t size) {}
int __attribute__((weak)) RunningOnValgrind() {
  runOnTsan = 0;
  return 0;
}
void __attribute__((weak)) __tsan_func_entry(const void *call_pc) {}
void __attribute__((weak)) __tsan_func_exit(void) {}
#endif
}

// This marker is used to define a happens-before arc. The race detector will
// infer an arc from the begin to the end when they share the same pointer
// argument.
#define TsanHappensBefore(cv) AnnotateHappensBefore(__FILE__, __LINE__, cv)

// This marker defines the destination of a happens-before arc.
#define TsanHappensAfter(cv) AnnotateHappensAfter(__FILE__, __LINE__, cv)

// Ignore any races on writes between here and the next TsanIgnoreWritesEnd.
#define TsanIgnoreWritesBegin() AnnotateIgnoreWritesBegin(__FILE__, __LINE__)

// Resume checking for racy writes.
#define TsanIgnoreWritesEnd() AnnotateIgnoreWritesEnd(__FILE__, __LINE__)

// We don't really delete the clock for now.
#define TsanDeleteClock(cv)

// Annotate newly allocated and freed memory; both reset the shadow state of
// the given range via AnnotateNewMemory.
#define TsanNewMemory(addr, size)                                              \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#define TsanFreeMemory(addr, size)                                             \
  AnnotateNewMemory(__FILE__, __LINE__, addr, size)
#endif
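
// Illustrative usage sketch (not part of the runtime): a happens-before arc
// pairs one TsanHappensBefore(p) with a later TsanHappensAfter(p) on the same
// address p, ordering the surrounding accesses. Assuming a hypothetical
// shared variable 'x' and some clock address 'sync':
//
//   int x;
//   ompt_tsan_clockid sync;      // any stable address serves as the clock
//   // producer:
//   x = 42;
//   TsanHappensBefore(&sync);
//   // consumer (runs after the runtime enforces the ordering):
//   TsanHappensAfter(&sync);
//   use(x);                      // TSan reports no race on x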

// Function entry/exit
#define TsanFuncEntry(pc) __tsan_func_entry(pc)
#define TsanFuncExit() __tsan_func_exit()

/// Required OMPT inquiry functions.
static ompt_get_parallel_info_t ompt_get_parallel_info;
static ompt_get_thread_data_t ompt_get_thread_data;

typedef uint64_t ompt_tsan_clockid;

static uint64_t my_next_id() {
  static uint64_t ID = 0;
  uint64_t ret = __sync_fetch_and_add(&ID, 1);
  return ret;
}

// Data structure to provide a thread-safe pool of reusable objects.
// DataPool<Type of objects, Number of objects per block allocation>
template <typename T, int N> struct DataPool {
  std::mutex DPMutex;
  std::stack<T *> DataPointer;
  std::list<void *> memory;
  int total;

  void newDatas() {
    // Prefix each Data object with a pointer to 'this'; this allows memory to
    // be returned to the originating pool without explicitly knowing the
    // source.
    //
    // To reduce lock contention we use thread-local DataPools, but Data
    // objects may move to other threads. The strategy is to get objects from
    // the local pool; only if an object moved to another thread might we see
    // a penalty on release (returnData). In the "single producer" pattern a
    // single thread creates tasks that are executed by other threads; the
    // master has a high demand for TaskData, so objects are returned to it
    // after use.
    struct pooldata {
      DataPool<T, N> *dp;
      T data;
    };
    // We allocate without initializing the memory because we cannot call
    // constructors here. Therefore use malloc!
    pooldata *datas = (pooldata *)malloc(sizeof(pooldata) * N);
    memory.push_back(datas);
    for (int i = 0; i < N; i++) {
      datas[i].dp = this;
      DataPointer.push(&(datas[i].data));
    }
    total += N;
  }

  T *getData() {
    T *ret;
    DPMutex.lock();
    if (DataPointer.empty())
      newDatas();
    ret = DataPointer.top();
    DataPointer.pop();
    DPMutex.unlock();
    return ret;
  }

  void returnData(T *data) {
    DPMutex.lock();
    DataPointer.push(data);
    DPMutex.unlock();
  }

  void getDatas(int n, T **datas) {
    DPMutex.lock();
    for (int i = 0; i < n; i++) {
      if (DataPointer.empty())
        newDatas();
      datas[i] = DataPointer.top();
      DataPointer.pop();
    }
    DPMutex.unlock();
  }

  void returnDatas(int n, T **datas) {
    DPMutex.lock();
    for (int i = 0; i < n; i++) {
      DataPointer.push(datas[i]);
    }
    DPMutex.unlock();
  }

  DataPool() : DPMutex(), DataPointer(), total(0) {}

  ~DataPool() {
    // We assume all objects have been returned by the time the thread
    // finishes and this destructor is called.
    for (auto i : memory)
      if (i)
        free(i);
  }
};

// This function takes care to return the data to the originating DataPool
// A pointer to the originating DataPool is stored just before the actual data.
template <typename T, int N> static void retData(void *data) {
  ((DataPool<T, N> **)data)[-1]->returnData((T *)data);
}
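
// Memory layout produced by DataPool::newDatas() for each pooled element:
//
//   [ DataPool<T, N> *dp | T data ]
//                          ^-- pointer handed out by getData()
//
// Stepping back one pointer-width from 'data' recovers 'dp', so retData() can
// return an object to its originating pool from any thread.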

struct ParallelData;
__thread DataPool<ParallelData, 4> *pdp;

/// Data structure to store additional information for parallel regions.
struct ParallelData {

  // Parallel fork is just another barrier, use Barrier[1]

  /// Two addresses for relationships with barriers.
  ompt_tsan_clockid Barrier[2];

  const void *codePtr;

  void *GetParallelPtr() { return &(Barrier[1]); }

  void *GetBarrierPtr(unsigned Index) { return &(Barrier[Index]); }

  ParallelData(const void *codeptr) : codePtr(codeptr) {}
  ~ParallelData() {
    TsanDeleteClock(&(Barrier[0]));
    TsanDeleteClock(&(Barrier[1]));
  }
  // overload new/delete to use DataPool for memory management.
  void *operator new(size_t size) { return pdp->getData(); }
  void operator delete(void *p, size_t) { retData<ParallelData, 4>(p); }
};
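
// Lifecycle sketch: ompt_tsan_parallel_begin publishes the fork via
// TsanHappensBefore(GetParallelPtr()); each implicit task observes it with
// TsanHappensAfter in ompt_tsan_implicit_task; ompt_tsan_parallel_end
// synchronizes on both barrier addresses before deleting this object.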

static inline ParallelData *ToParallelData(ompt_data_t *parallel_data) {
  return reinterpret_cast<ParallelData *>(parallel_data->ptr);
}

struct Taskgroup;
__thread DataPool<Taskgroup, 4> *tgp;

/// Data structure to support stacking of taskgroups and allow synchronization.
struct Taskgroup {
  /// Its address is used for relationships of the taskgroup's task set.
  ompt_tsan_clockid Ptr;

  /// Reference to the parent taskgroup.
  Taskgroup *Parent;

  Taskgroup(Taskgroup *Parent) : Parent(Parent) {}
  ~Taskgroup() { TsanDeleteClock(&Ptr); }

  void *GetPtr() { return &Ptr; }
  // overload new/delete to use DataPool for memory management.
  void *operator new(size_t size) { return tgp->getData(); }
  void operator delete(void *p, size_t) { retData<Taskgroup, 4>(p); }
};

struct TaskData;
__thread DataPool<TaskData, 4> *tdp;

/// Data structure to store additional information for tasks.
struct TaskData {
  /// Its address is used for relationships of this task.
  ompt_tsan_clockid Task;

  /// Child tasks use its address to declare a relationship to a taskwait in
  /// this task.
  ompt_tsan_clockid Taskwait;

  /// Whether this task is currently executing a barrier.
  bool InBarrier;

  /// Whether this task is an included task.
  bool Included;

  /// Index of which barrier to use next.
  char BarrierIndex;

  /// Number of child tasks that reference this structure, plus one for the
  /// task itself.
  std::atomic_int RefCount;

  /// Reference to the parent that created this task.
  TaskData *Parent;

  /// Reference to the implicit task in the stack above this task.
  TaskData *ImplicitTask;

  /// Reference to the team of this task.
  ParallelData *Team;

  /// Reference to the current taskgroup that this task either belongs to or
  /// that it just created.
  Taskgroup *TaskGroup;

  /// Dependency information for this task.
  ompt_dependence_t *Dependencies;

  /// Number of dependency entries.
  unsigned DependencyCount;

  void *PrivateData;
  size_t PrivateDataSize;

  int execution;
  int freed;

  TaskData(TaskData *Parent)
      : InBarrier(false), Included(false), BarrierIndex(0), RefCount(1),
        Parent(Parent), ImplicitTask(nullptr),
        Team(Parent ? Parent->Team : nullptr), TaskGroup(nullptr),
        Dependencies(nullptr), DependencyCount(0), execution(0), freed(0) {
    if (Parent != nullptr) {
      Parent->RefCount++;
      // Copy over pointer to taskgroup. This task may set up its own stack
      // but for now belongs to its parent's taskgroup.
      TaskGroup = Parent->TaskGroup;
    }
  }

  TaskData(ParallelData *Team = nullptr)
      : InBarrier(false), Included(false), BarrierIndex(0), RefCount(1),
        Parent(nullptr), ImplicitTask(this), Team(Team), TaskGroup(nullptr),
        Dependencies(nullptr), DependencyCount(0), execution(1), freed(0) {}

  ~TaskData() {
    TsanDeleteClock(&Task);
    TsanDeleteClock(&Taskwait);
  }

  void *GetTaskPtr() { return &Task; }

  void *GetTaskwaitPtr() { return &Taskwait; }
  // overload new/delete to use DataPool for memory management.
  void *operator new(size_t size) { return tdp->getData(); }
  void operator delete(void *p, size_t) { retData<TaskData, 4>(p); }
};
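
// Reference-counting sketch: a TaskData starts with RefCount == 1 (the task
// itself) and each created child increments its parent's count. On task
// completion, ompt_tsan_task_schedule walks up the Parent chain and deletes
// every TaskData whose count drops to zero.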

static inline TaskData *ToTaskData(ompt_data_t *task_data) {
  return reinterpret_cast<TaskData *>(task_data->ptr);
}

static inline void *ToInAddr(void *OutAddr) {
  // FIXME: This will give false negatives when a second variable lies
  //        directly behind a variable that only has a width of 1 byte.
  //        Another approach would be to "negate" the address or to flip the
  //        first bit...
  return reinterpret_cast<char *>(OutAddr) + 1;
}
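
// Address scheme for task dependencies (see ompt_tsan_task_schedule below):
// for a dependency variable at address P, every completing task publishes on
// ToInAddr(P) = P + 1, and 'out'/'inout' tasks additionally publish on P;
// starting tasks wait on P, and 'out'/'inout' tasks additionally wait on
// ToInAddr(P). Writers thus order against prior readers and writers, while
// concurrent 'in' readers stay unordered among themselves.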

/// Store a mutex for each wait_id to resolve race condition with callbacks.
std::unordered_map<ompt_wait_id_t, std::mutex> Locks;
std::mutex LocksMutex;
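
// ompt_tsan_mutex_acquired/_released below lazily create the std::mutex for a
// given wait_id under LocksMutex and use that mutex's address as the TSan
// clock for the release->acquire happens-before arc.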

static void ompt_tsan_thread_begin(ompt_thread_t thread_type,
                                   ompt_data_t *thread_data) {
  pdp = new DataPool<ParallelData, 4>;
  TsanNewMemory(pdp, sizeof(pdp));
  tgp = new DataPool<Taskgroup, 4>;
  TsanNewMemory(tgp, sizeof(tgp));
  tdp = new DataPool<TaskData, 4>;
  TsanNewMemory(tdp, sizeof(tdp));
  thread_data->value = my_next_id();
}

static void ompt_tsan_thread_end(ompt_data_t *thread_data) {
  delete pdp;
  delete tgp;
  delete tdp;
}

/// OMPT event callbacks for handling parallel regions.

static void ompt_tsan_parallel_begin(ompt_data_t *parent_task_data,
                                     const ompt_frame_t *parent_task_frame,
                                     ompt_data_t *parallel_data,
                                     uint32_t requested_team_size,
                                     int flag,
                                     const void *codeptr_ra) {
  ParallelData *Data = new ParallelData(codeptr_ra);
  parallel_data->ptr = Data;

  TsanHappensBefore(Data->GetParallelPtr());
}

static void ompt_tsan_parallel_end(ompt_data_t *parallel_data,
                                   ompt_data_t *task_data,
                                   int flag,
                                   const void *codeptr_ra) {
  ParallelData *Data = ToParallelData(parallel_data);
  TsanHappensAfter(Data->GetBarrierPtr(0));
  TsanHappensAfter(Data->GetBarrierPtr(1));

  delete Data;

#if (LLVM_VERSION) >= 40
  if (&__archer_get_omp_status) {
    if (__archer_get_omp_status() == 0 && archer_flags->flush_shadow)
      __tsan_flush_memory();
  }
#endif
}

static void ompt_tsan_implicit_task(ompt_scope_endpoint_t endpoint,
                                    ompt_data_t *parallel_data,
                                    ompt_data_t *task_data,
                                    unsigned int team_size,
                                    unsigned int thread_num,
                                    int type) {
  switch (endpoint) {
  case ompt_scope_begin:
    if (type & ompt_task_initial) {
      parallel_data->ptr = new ParallelData(nullptr);
    }
    task_data->ptr = new TaskData(ToParallelData(parallel_data));
    TsanHappensAfter(ToParallelData(parallel_data)->GetParallelPtr());
    TsanFuncEntry(ToParallelData(parallel_data)->codePtr);
    break;
  case ompt_scope_end: {
    TaskData *Data = ToTaskData(task_data);
    assert(Data->freed == 0 && "Implicit task end should only be called once!");
    Data->freed = 1;
    assert(Data->RefCount == 1 &&
           "All tasks should have finished at the implicit barrier!");
    delete Data;
    TsanFuncExit();
    break;
  }
  }
}

static void ompt_tsan_sync_region(ompt_sync_region_t kind,
                                  ompt_scope_endpoint_t endpoint,
                                  ompt_data_t *parallel_data,
                                  ompt_data_t *task_data,
                                  const void *codeptr_ra) {
  TaskData *Data = ToTaskData(task_data);
  switch (endpoint) {
  case ompt_scope_begin:
    TsanFuncEntry(codeptr_ra);
    switch (kind) {
      case ompt_sync_region_barrier_implementation:
      case ompt_sync_region_barrier_implicit:
      case ompt_sync_region_barrier_explicit:
      case ompt_sync_region_barrier: {
        char BarrierIndex = Data->BarrierIndex;
        TsanHappensBefore(Data->Team->GetBarrierPtr(BarrierIndex));

        if (hasReductionCallback < ompt_set_always) {
          // We ignore writes inside the barrier. These would occur either
          // during 1. reductions performed by the runtime, which are
          // guaranteed to be race-free, or 2. execution of another task.
          // For the latter case we will re-enable tracking in task_switch.
          Data->InBarrier = true;
          TsanIgnoreWritesBegin();
        }

        break;
      }

      case ompt_sync_region_taskwait:
        break;

      case ompt_sync_region_taskgroup:
        Data->TaskGroup = new Taskgroup(Data->TaskGroup);
        break;

      default:
        break;
    }
    break;
  case ompt_scope_end:
    TsanFuncExit();
    switch (kind) {
      case ompt_sync_region_barrier_implementation:
      case ompt_sync_region_barrier_implicit:
      case ompt_sync_region_barrier_explicit:
      case ompt_sync_region_barrier: {
        if (hasReductionCallback < ompt_set_always) {
          // We want to track writes after the barrier again.
          Data->InBarrier = false;
          TsanIgnoreWritesEnd();
        }

        char BarrierIndex = Data->BarrierIndex;
        // Barrier will end after it has been entered by all threads.
        if (parallel_data)
          TsanHappensAfter(Data->Team->GetBarrierPtr(BarrierIndex));

        // It is not guaranteed that all threads have exited this barrier before
        // we enter the next one. So we will use a different address.
        // We are however guaranteed that this current barrier is finished
        // by the time we exit the next one. So we can then reuse the first
        // address.
        Data->BarrierIndex = (BarrierIndex + 1) % 2;
        break;
      }

      case ompt_sync_region_taskwait: {
        if (Data->execution > 1)
          TsanHappensAfter(Data->GetTaskwaitPtr());
        break;
      }

      case ompt_sync_region_taskgroup: {
        assert(Data->TaskGroup != nullptr &&
               "Should have at least one taskgroup!");

        TsanHappensAfter(Data->TaskGroup->GetPtr());

        // Delete this allocated taskgroup; all descendant tasks are finished
        // by now.
        Taskgroup *Parent = Data->TaskGroup->Parent;
        delete Data->TaskGroup;
        Data->TaskGroup = Parent;
        break;
      }

      default:
        break;
    }
    break;
  }
}

static void ompt_tsan_reduction(ompt_sync_region_t kind,
                                ompt_scope_endpoint_t endpoint,
                                ompt_data_t *parallel_data,
                                ompt_data_t *task_data,
                                const void *codeptr_ra) {
  switch (endpoint) {
  case ompt_scope_begin:
    switch (kind) {
      case ompt_sync_region_reduction:
        TsanIgnoreWritesBegin();
        break;
      default:
        break;
    }
    break;
  case ompt_scope_end:
    switch (kind) {
      case ompt_sync_region_reduction:
        TsanIgnoreWritesEnd();
        break;
      default:
        break;
    }
    break;
  }
}

/// OMPT event callbacks for handling tasks.

static void ompt_tsan_task_create(
    ompt_data_t *parent_task_data, /* id of parent task            */
    const ompt_frame_t *parent_frame, /* frame data for parent task   */
    ompt_data_t *new_task_data, /* id of created task           */
    int type, int has_dependences,
    const void *codeptr_ra) /* pointer to outlined function */
{
  TaskData *Data;
  assert(new_task_data->ptr == NULL &&
         "Task data should be initialized to NULL");
  if (type & ompt_task_initial) {
    ompt_data_t *parallel_data;
    int team_size = 1;
    ompt_get_parallel_info(0, &parallel_data, &team_size);
    ParallelData *PData = new ParallelData(nullptr);
    parallel_data->ptr = PData;

    Data = new TaskData(PData);
    new_task_data->ptr = Data;
  } else if (type & ompt_task_undeferred) {
    Data = new TaskData(ToTaskData(parent_task_data));
    new_task_data->ptr = Data;
    Data->Included = true;
  } else if (type & ompt_task_explicit || type & ompt_task_target) {
    Data = new TaskData(ToTaskData(parent_task_data));
    new_task_data->ptr = Data;

    // Use the newly created address. We cannot use a single address from the
    // parent because that would declare wrong relationships with other
    // sibling tasks that may be created before this task is started!
    TsanHappensBefore(Data->GetTaskPtr());
    ToTaskData(parent_task_data)->execution++;
  }
}

static void ompt_tsan_task_schedule(ompt_data_t *first_task_data,
                                    ompt_task_status_t prior_task_status,
                                    ompt_data_t *second_task_data) {
  TaskData *FromTask = ToTaskData(first_task_data);
  TaskData *ToTask = ToTaskData(second_task_data);

  if (ToTask->Included && prior_task_status != ompt_task_complete)
    return; // No further synchronization when an included task begins.
  if (FromTask->Included && prior_task_status == ompt_task_complete) {
    // Just delete the task:
    while (FromTask != nullptr && --FromTask->RefCount == 0) {
      TaskData *Parent = FromTask->Parent;
      if (FromTask->DependencyCount > 0) {
        delete[] FromTask->Dependencies;
      }
      delete FromTask;
      FromTask = Parent;
    }
    return;
  }

  if (ToTask->execution == 0) {
    ToTask->execution++;
    // 1. Task will begin execution after it has been created.
    TsanHappensAfter(ToTask->GetTaskPtr());
    for (unsigned i = 0; i < ToTask->DependencyCount; i++) {
      ompt_dependence_t *Dependency = &ToTask->Dependencies[i];

      TsanHappensAfter(Dependency->variable.ptr);
      // in and inout dependencies are also blocked by prior in dependencies!
      if (Dependency->dependence_type == ompt_dependence_type_out ||
          Dependency->dependence_type == ompt_dependence_type_inout) {
        TsanHappensAfter(ToInAddr(Dependency->variable.ptr));
      }
    }
  } else {
    // 2. Task will resume after it has been switched away.
    TsanHappensAfter(ToTask->GetTaskPtr());
  }

  if (prior_task_status != ompt_task_complete) {
    ToTask->ImplicitTask = FromTask->ImplicitTask;
    assert(ToTask->ImplicitTask != NULL &&
           "A task belongs to a team and has an implicit task on the stack");
  }

  // Task may be resumed at a later point in time.
  TsanHappensBefore(FromTask->GetTaskPtr());

  if (hasReductionCallback < ompt_set_always && FromTask->InBarrier) {
    // We want to ignore writes in the runtime code during barriers,
    // but not when executing tasks with user code!
    TsanIgnoreWritesEnd();
  }

  if (prior_task_status == ompt_task_complete) { // task finished

    // Task will finish before a barrier in the surrounding parallel region ...
    ParallelData *PData = FromTask->Team;
    TsanHappensBefore(
        PData->GetBarrierPtr(FromTask->ImplicitTask->BarrierIndex));

    // ... and before an eventual taskwait by the parent thread.
    TsanHappensBefore(FromTask->Parent->GetTaskwaitPtr());

    if (FromTask->TaskGroup != nullptr) {
      // This task is part of a taskgroup, so it will finish before the
      // corresponding taskgroup_end.
      TsanHappensBefore(FromTask->TaskGroup->GetPtr());
    }
    for (unsigned i = 0; i < FromTask->DependencyCount; i++) {
      ompt_dependence_t *Dependency = &FromTask->Dependencies[i];

      // in dependencies block following inout and out dependencies!
      TsanHappensBefore(ToInAddr(Dependency->variable.ptr));
      if (Dependency->dependence_type == ompt_dependence_type_out ||
          Dependency->dependence_type == ompt_dependence_type_inout) {
        TsanHappensBefore(Dependency->variable.ptr);
      }
    }
    while (FromTask != nullptr && --FromTask->RefCount == 0) {
      TaskData *Parent = FromTask->Parent;
      if (FromTask->DependencyCount > 0) {
        delete[] FromTask->Dependencies;
      }
      delete FromTask;
      FromTask = Parent;
    }
  }
  if (hasReductionCallback < ompt_set_always && ToTask->InBarrier) {
    // We re-enter runtime code which currently performs a barrier.
    TsanIgnoreWritesBegin();
  }
}

static void ompt_tsan_dependences(ompt_data_t *task_data,
                                  const ompt_dependence_t *deps,
                                  int ndeps) {
  if (ndeps > 0) {
    // Copy the data to use it in task_switch and task_end.
    TaskData *Data = ToTaskData(task_data);
    Data->Dependencies = new ompt_dependence_t[ndeps];
    std::memcpy(Data->Dependencies, deps,
                sizeof(ompt_dependence_t) * ndeps);
    Data->DependencyCount = ndeps;

    // This callback is executed before this task is first started.
    TsanHappensBefore(Data->GetTaskPtr());
  }
}

/// OMPT event callbacks for handling locking.
static void ompt_tsan_mutex_acquired(ompt_mutex_t kind,
                                     ompt_wait_id_t wait_id,
                                     const void *codeptr_ra) {
  // Acquire our own lock to make sure that
  // 1. the previous release has finished.
  // 2. the next acquire doesn't start before we have finished our release.
  LocksMutex.lock();
  std::mutex &Lock = Locks[wait_id];
  LocksMutex.unlock();

  Lock.lock();
  TsanHappensAfter(&Lock);
}

static void ompt_tsan_mutex_released(ompt_mutex_t kind,
                                     ompt_wait_id_t wait_id,
                                     const void *codeptr_ra) {
  LocksMutex.lock();
  std::mutex &Lock = Locks[wait_id];
  LocksMutex.unlock();
  TsanHappensBefore(&Lock);

  Lock.unlock();
}

// callback , signature , variable to store result , required support level
#define SET_OPTIONAL_CALLBACK_T(event, type, result, level)                             \
  do {                                                                                  \
    ompt_callback_##type##_t tsan_##event = &ompt_tsan_##event;                         \
    result = ompt_set_callback(ompt_callback_##event,                                   \
                                (ompt_callback_t)tsan_##event);                         \
    if (result < level)                                                                 \
      printf("Registered callback '" #event "' is not supported at " #level " (%i)\n",  \
             result);                                                                   \
  } while (0)

#define SET_CALLBACK_T(event, type)                              \
  do {                                                           \
    int res;                                                     \
    SET_OPTIONAL_CALLBACK_T(event, type, res, ompt_set_always);  \
  } while (0)

#define SET_CALLBACK(event) SET_CALLBACK_T(event, event)
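
// For illustration, SET_CALLBACK(parallel_begin) expands roughly to:
//
//   ompt_callback_parallel_begin_t tsan_parallel_begin =
//       &ompt_tsan_parallel_begin;
//   res = ompt_set_callback(ompt_callback_parallel_begin,
//                           (ompt_callback_t)tsan_parallel_begin);
//   if (res < ompt_set_always)
//     printf("Registered callback 'parallel_begin' is not supported ...");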

static int ompt_tsan_initialize(ompt_function_lookup_t lookup,
                                int device_num,
                                ompt_data_t *tool_data) {
  const char *options = getenv("TSAN_OPTIONS");
  TsanFlags tsan_flags(options);

  ompt_set_callback_t ompt_set_callback =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  if (ompt_set_callback == NULL) {
    std::cerr << "Could not set callback, exiting..." << std::endl;
    std::exit(1);
  }
  ompt_get_parallel_info =
      (ompt_get_parallel_info_t)lookup("ompt_get_parallel_info");
  ompt_get_thread_data = (ompt_get_thread_data_t)lookup("ompt_get_thread_data");

  if (ompt_get_parallel_info == NULL) {
    fprintf(stderr, "Could not get inquiry function 'ompt_get_parallel_info', "
                    "exiting...\n");
    exit(1);
  }

  SET_CALLBACK(thread_begin);
  SET_CALLBACK(thread_end);
  SET_CALLBACK(parallel_begin);
  SET_CALLBACK(implicit_task);
  SET_CALLBACK(sync_region);
  SET_CALLBACK(parallel_end);

  SET_CALLBACK(task_create);
  SET_CALLBACK(task_schedule);
  SET_CALLBACK(dependences);

  SET_CALLBACK_T(mutex_acquired, mutex);
  SET_CALLBACK_T(mutex_released, mutex);
  SET_OPTIONAL_CALLBACK_T(reduction, sync_region, hasReductionCallback, ompt_set_never);

  if (!tsan_flags.ignore_noninstrumented_modules)
    fprintf(
        stderr,
        "Warning: please export TSAN_OPTIONS='ignore_noninstrumented_modules=1' "
        "to avoid false positive reports from the OpenMP runtime!\n");
  return 1; // success
}

static void ompt_tsan_finalize(ompt_data_t *tool_data) {
  if (archer_flags->print_max_rss) {
    struct rusage end;
    getrusage(RUSAGE_SELF, &end);
    printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss);
  }

  delete archer_flags;
}

extern "C"
ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                          const char *runtime_version) {
  const char *options = getenv("ARCHER_OPTIONS");
  archer_flags = new ArcherFlags(options);
  if (!archer_flags->enabled) {
    if (archer_flags->verbose)
      std::cout << "Archer disabled, stopping operation" << std::endl;
    delete archer_flags;
    return NULL;
  }

  static ompt_start_tool_result_t ompt_start_tool_result = {
      &ompt_tsan_initialize, &ompt_tsan_finalize, {0}};
  runOnTsan = 1;
  RunningOnValgrind();
  // If we are not running on TSan, give a different tool the chance to be
  // loaded.
  if (!runOnTsan) {
    if (archer_flags->verbose)
      std::cout << "Archer detected OpenMP application without TSan; "
                   "stopping operation"
                << std::endl;
    delete archer_flags;
    return NULL;
  }

  if (archer_flags->verbose)
    std::cout << "Archer detected OpenMP application with TSan, supplying "
                 "OpenMP synchronization semantics"
              << std::endl;
  return &ompt_start_tool_result;
}