Although the code shown in this section is not really complicated, it is closer to real code and lets us look at performance aspects. The goal of this example is simply to calculate the mean of a very big vector. A template function abstracts the code from the concrete execution agent. This is the code:
template <class ExecutorType>
void _testMeanVector(mel::execution::ExFuture<ExecutorType,vector<double>> fut,tests::BaseTest* test)
{
    typedef vector<double> VectorType;
    try
    {
        Timer timer;
        uint64_t t0 = timer.getMilliseconds();
        auto res5 = ::mel::core::waitForFutureThread(
            fut
            | mel::execution::parallel_convert(
                [](const VectorType& v) noexcept
                {
                    double mean = 0.0;
                    size_t tam = v.size()/4;
                    size_t endIdx = tam;
                    for(size_t i = 0; i < endIdx;++i)
                        mean += v[i];
                    mean /= v.size();
                    return mean;
                },
                [](const VectorType& v) noexcept
                {
                    double mean = 0.0;
                    size_t tam = v.size()/4;
                    size_t startIdx = tam;
                    size_t endIdx = tam*2;
                    for(size_t i = startIdx; i < endIdx;++i)
                        mean += v[i];
                    mean /= v.size();
                    return mean;
                },
                [](const VectorType& v) noexcept
                {
                    double mean = 0.0;
                    size_t tam = v.size()/4;
                    size_t startIdx = tam*2;
                    size_t endIdx = tam*3;
                    for(size_t i = startIdx; i < endIdx;++i)
                        mean += v[i];
                    mean /= v.size();
                    return mean;
                },
                [](const VectorType& v) noexcept
                {
                    double mean = 0.0;
                    size_t tam = v.size()/4;
                    size_t startIdx = tam*3;
                    size_t endIdx = v.size();
                    for(size_t i = startIdx; i < endIdx;++i)
                        mean += v[i];
                    mean /= v.size();
                    return mean;
                })
            | mel::execution::next([](const std::tuple<double,double,double,double>& means) noexcept
                {
                    return std::get<0>(means)+std::get<1>(means)+std::get<2>(means)+std::get<3>(means);
                }));
        uint64_t t1 = timer.getMilliseconds();
        text::info("Mean = {}. Time spent = {} seconds",res5.value(),(float)((t1-t0)/1000.f));
    }catch(std::exception& e)
    {
        text::info("Error = {}",e.what());
    }
}
And from the main function:
Runnable::RunnableCreationOptions opts;
opts.schedulerOpts = ProcessScheduler::LockFreeOptions{};
auto th1 = ThreadRunnable::create(true,opts);
execution::Executor<Runnable> exr(th1);
typedef vector<double> VectorType;
auto initFut = execution::launch(exr,[]()
{
    constexpr size_t vecSize = 100'000'000;
    VectorType v(vecSize);
    for( size_t i = 0; i < vecSize;++i)
        v[i] = (std::rand()%20)/3.0;
    return v;
});
auto initRes = core::waitForFutureThread<mel::core::WaitErrorNoException>( initFut );
Timer timer;
uint64_t t0 = timer.getMilliseconds();
text::info("vector mean: plain way");
double mean = 0.0;
{
    auto& v = initRes.value();
    size_t endIdx = v.size();
    for(size_t i = 0; i < endIdx;++i)
        mean+=v[i];
    mean/=v.size();
}
uint64_t t1 = timer.getMilliseconds();
mel::text::info("Mean = {}. Time spent = {}",mean,(float)((t1-t0)/1000.f));
{
    parallelism::ThreadPool::ThreadPoolOpts opts;
    opts.threadOpts.schedulerOpts = ProcessScheduler::LockFreeOptions{};
    auto myPool = make_shared<parallelism::ThreadPool>(opts);
    execution::Executor<parallelism::ThreadPool> extp(myPool);
    extp.setOpts({true,true});
    text::info("vector mean: ThreadPoolExecutor");
    _testMeanVector( execution::transfer(initFut,extp),test);
}
{
    exr.setOpts({true,false});
    text::info("vector mean: RunnableExecutor");
    _testMeanVector( execution::transfer(initFut,exr),test);
}
{
    execution::InlineExecutor ex;
    text::info("vector mean: InlineExecutor");
    _testMeanVector( execution::transfer(initFut,ex),test);
}
In this code, a very big vector is created, and the mean of its values is calculated by:
- splitting the vector into 4 parts and calculating a partial mean on each. This is done using parallel_convert with 4 parallel tasks, each of which returns its own partial mean, so the full result of this job is a tuple.
- passing the previous tuple with next to a callable which sums the four partial results.
Also, a plain sequential calculation is done first to compare the results. On the machine used for these tests, the output is:
vector mean: plain way
Mean = 3.1669570700186305. Time spent = 0.092
vector mean: ThreadPoolExecutor
Mean = 3.166957070004096. Time spent = 0.044 seconds
vector mean: RunnableExecutor
Mean = 3.166957070004096. Time spent = 0.093 seconds
vector mean: InlineExecutor
Mean = 3.166957070004096. Time spent = 0.091 seconds
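Note that the plain-way mean differs from the parallel ones in the last digits. This is expected: floating-point addition is not associative, so summing the vector in four chunks rounds differently than a single left-to-right pass. A minimal illustration of the effect:

```cpp
// Floating-point addition is not associative: grouping the same three
// values differently can change the rounding of intermediate results.
double sumLeft(double a, double b, double c)  { return (a + b) + c; }
double sumRight(double a, double b, double c) { return a + (b + c); }
```

With a = 1e16, b = -1e16 and c = 1.0, sumLeft gives 1 while sumRight gives 0, because b + c rounds back to -1e16 before a is added.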
The results show two pieces of very good news:
- the ThreadPoolExecutor is more than twice as fast as a simple RunnableExecutor. By default a ThreadPool uses all available cores; on the machine used here there are 8 cores, so each task in parallel_convert runs fully independently. Although this might suggest the code should be 4 times faster, other factors limit performance, mainly memory and cache effects.
- the direct way and the InlineExecutor take almost the same time as the RunnableExecutor way, meaning that the internals of the execution and microthread system add very, very little overhead.
To improve on the previous code, it would be better to take advantage of the concrete parallelism available on the system: instead of dividing the input vector into 4 fixed parts, divide it into as many parts as the system can run in parallel. The code could be:
// as many chunks as logical cores when the executor provides real
// parallelism (reported by ExecutorTraits); otherwise one chunk is enough
int numParts = execution::ExecutorTraits<ExecutorType>::has_parallelism ?
    (int)core::getNumProcessors() : 1;
try
{
    Timer timer;
    uint64_t t0 = timer.getMilliseconds();
    vector<double> means(numParts);
    auto res5 = ::mel::core::waitForFutureThread(
        fut
        | mel::execution::loop( 0,(int)numParts,
            [&means,numParts](int idx,VectorType& v) noexcept
            {
                double mean = 0.0;
                size_t tam = v.size()/numParts;
                size_t startIdx = idx*tam;
                size_t endIdx;
                if ( idx == numParts-1)
                    endIdx = v.size();
                else
                    endIdx = startIdx+tam;
                for(size_t i = startIdx; i < endIdx;++i)
                    mean += v[i];
                mean /= v.size();
                means[idx] = mean;
            },1 )
        | mel::execution::next([&means](const VectorType&) noexcept
            {
                double mean = 0.0;
                for(size_t i = 0; i < means.size();++i)
                    mean+=means[i];
                return mean;
            })
    );
    uint64_t t1 = timer.getMilliseconds();
    text::info("Using loop. Mean = {}. Time spent = {} seconds",res5.value(),(float)((t1-t0)/1000.f));
}catch(std::exception& e)
{
    text::info("Error = {}",e.what());
}
In this new version, loop is used, performing as many iterations as there are available cores. Also, the parallelism capabilities of the executor in use are checked with ExecutorTraits, since without real parallelism, dividing the processing into chunks gives no advantage (though, as the previous example showed, the loss would be minimal).