/**
* Copyright (C) Mellanox Technologies Ltd. 2001-2014.  ALL RIGHTS RESERVED.
*
* Copyright (C) UT-Battelle, LLC. 2015. ALL RIGHTS RESERVED.
* See file LICENSE for terms.
*/

#include "uct_test.h"

#include <gtest/common/test_perf.h>
extern "C" {
#include <ucs/arch/cpu.h>
}

/* Scaling factor 1 / 1024^2, used as the normalization value of the "MB/sec"
 * bandwidth entries in the test table. Written as a plain constant expression
 * so it does not rely on pow() being declared by some other header. */
#define MB                            (1.0 / (1024.0 * 1024.0))
/* Factor by which every test's [min, max] window is widened before it runs
 * (max is multiplied by it, min is divided by it) */
#define UCT_PERF_TEST_MULTIPLIER      5
/* Wider factor applied instead when the CPU model is reported as ARM AArch64 */
#define UCT_ARM_PERF_TEST_MULTIPLIER  15

/**
 * Fixture for the UCT performance tests: derives from both uct_test and
 * test_perf and declares the table of test specifications to run.
 */
class test_uct_perf : public uct_test, public test_perf {
protected:
    /* Table of test specifications; defined out of class and terminated by an
     * entry whose title is NULL */
    static const test_spec tests[];
};


/*
 * Table of UCT performance tests executed by the "envelope" test case.
 *
 * Entries are positional aggregate initializers of test_perf::test_spec, so
 * the field order must match its declaration. NOTE(review): the layout appears
 * to be: title, units, API, command, test type, wait mode, data layout,
 * IOV stride, number of message lengths, message lengths, max outstanding
 * operations, iteration count, offset of the measured field inside
 * ucx_perf_result_t, normalization factor, min, max, test flags - confirm
 * against the declaration in gtest/common/test_perf.h.
 *
 * Every entry spells out all of its fields, including the trailing flags
 * value. The table is terminated by an entry whose title is NULL.
 */
const test_perf::test_spec test_uct_perf::tests[] =
{
  { "am short latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5,
    0 },

  { "am short rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu,
    ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0,
    0 },

  { "am short rate64", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 64 }, 1, 2000000lu,
    ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0,
    0 },

  { "am short iov latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 4, 4 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5,
    0 },

  { "am short iov rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 4, 4 }, 1, 2000000lu,
    ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0,
    0 },

  { "am short iov rate64", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT_IOV, 0, 2, { 32, 32 }, 1, 2000000lu,
    ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0,
    0 },

  { "am bcopy latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 2.5,
    0 },

  { "am bcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 1000 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 15000.0,
    0 },

  { "am zcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000lu,
    ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0,
    0 },

  { "am zcopy bw flush ep", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 1000 }, 32, 100000lu,
    ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0,
    UCX_PERF_TEST_FLAG_FLUSH_EP },

  { "put latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 1.5,
    0 },

  { "put rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu,
    ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.8, 80.0,
    0 },

  { "put bcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_BCOPY, 0, 1, { 2048 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 50000.0,
    0 },

  { "put zcopy bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_PUT, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 2048 }, 32, 100000lu,
    ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 620.0, 50000.0,
    0 },

  { "get latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_GET, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5,
    0 },

  { "atomic add latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_PINGPONG,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5,
    0 },

  { "atomic add rate", "Mpps",
    UCX_PERF_API_UCT, UCX_PERF_CMD_ADD, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 2000000lu,
    ucs_offsetof(ucx_perf_result_t, msgrate.total_average), 1e-6, 0.5, 50.0,
    0 },

  { "atomic fadd latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_FADD, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5,
    0 },

  { "atomic cswap latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_CSWAP, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5,
    0 },

  { "atomic swap latency", "usec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_SWAP, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_SHORT, 0, 1, { 8 }, 1, 100000lu,
    ucs_offsetof(ucx_perf_result_t, latency.total_average), 1e6, 0.01, 3.5,
    0 },

  { "am iov bw", "MB/sec",
    UCX_PERF_API_UCT, UCX_PERF_CMD_AM, UCX_PERF_TEST_TYPE_STREAM_UNI,
    UCX_PERF_WAIT_MODE_POLL,
    UCT_PERF_DATA_LAYOUT_ZCOPY, 8192, 3, { 256, 256, 512 }, 32, 100000lu,
    ucs_offsetof(ucx_perf_result_t, bandwidth.total_average), MB, 600.0, 15000.0,
    0 },

  /* Terminator: the test loop stops at the first entry with a NULL title */
  { NULL }
};


/**
 * Runs every entry of test_uct_perf::tests on the transport and device named
 * by the test parameter.
 *
 * - The whole test is skipped for the "cm" and "ugni_udt" transports.
 * - Performance limits are not enforced on SandyBridge CPUs when any CPU
 *   returned by get_affinity() is missing from the parameter's local_cpus
 *   set, nor for the "tcp" transport, which is also capped at 1000
 *   iterations per test.
 * - Each test's [min, max] window is widened by a CPU-model-dependent
 *   multiplier before the test is run.
 */
UCS_TEST_P(test_uct_perf, envelope) {
    if (has_transport("cm") ||
        has_transport("ugni_udt")) {
        UCS_TEST_SKIP;
    }

    std::vector<int> cpus = get_affinity();
    bool check_perf       = true;
    size_t max_iter       = std::numeric_limits<size_t>::max();

    /* For SandyBridge CPUs, don't check performance of far-socket devices */
    if (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_INTEL_SANDYBRIDGE) {
        for (std::vector<int>::iterator iter = cpus.begin(); iter != cpus.end(); ++iter) {
            if (!ucs_cpu_is_set(*iter, &GetParam()->local_cpus)) {
                UCS_TEST_MESSAGE << "Not enforcing performance on SandyBridge far socket";
                check_perf = false;
                break;
            }
        }
    }

    if (has_transport("tcp")) {
        check_perf = false; /* TODO calibrate expected performance based on transport */
        max_iter   = 1000lu;
    }

    /* The CPU model cannot change between table entries, so pick the widening
     * factor once instead of querying the model on every iteration */
    const int perf_multiplier =
        (ucs_arch_get_cpu_model() == UCS_CPU_MODEL_ARM_AARCH64) ?
        UCT_ARM_PERF_TEST_MULTIPLIER : UCT_PERF_TEST_MULTIPLIER;

    /* Run all tests */
    for (const test_spec *test_iter = tests; test_iter->title != NULL; ++test_iter) {
        /* Work on a copy so the shared table itself is never modified */
        test_spec test = *test_iter;

        test.max  *= perf_multiplier;
        test.min  /= perf_multiplier;
        test.iters = ucs_min(test.iters, max_iter);

        /* NOTE(review): the meaning of the literal 0 (second argument) is
         * defined by test_perf::run_test - confirm against test_perf.h */
        run_test(test, 0, check_perf, GetParam()->tl_name, GetParam()->dev_name);
    }
}

/* Instantiate the test_uct_perf test case. NOTE(review): going by the macro
 * name, the "self" transport is presumably excluded - confirm against the
 * macro definition in uct_test.h */
UCT_INSTANTIATE_NO_SELF_TEST_CASE(test_uct_perf);
