Modulating an oscillator by gain, pitch, and offset











up vote
1
down vote

favorite












I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.



It processes an Oscillator, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.



Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot) optimized settings):



#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>

const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;

#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;

Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}

inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }

inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];

int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}

private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];

Param *pGain, *pOffset, *pPitch;

Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);

// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}

void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();

// precomputed data
double bp0 = noteFrequency * hostPitch;

// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

*left++ += value;
*right++ += value;

// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}

// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};

class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;

MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];

// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}

void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));

// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate

// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);

// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};

int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];

MyPlugin myPlugin;

// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);

// do somethings with buffer

}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}


Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.



I hope I can do it better. Any tips/suggestions? Vectorizing?



Note: I could use std::clamp and C++17, but it doesn't matter a lot here: nothing really change with that expedient.










share|improve this question




















  • 1




    I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
    – Toby Speight
    2 days ago















up vote
1
down vote

favorite












I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.



It processes an Oscillator, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.



Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot) optimized settings):



#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>

const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;

#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;

Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}

inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }

inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];

int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}

private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];

Param *pGain, *pOffset, *pPitch;

Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);

// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}

void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();

// precomputed data
double bp0 = noteFrequency * hostPitch;

// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

*left++ += value;
*right++ += value;

// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}

// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};

class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;

MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];

// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}

void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));

// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate

// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);

// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};

int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];

MyPlugin myPlugin;

// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);

// do somethings with buffer

}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}


Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.



I hope I can do it better. Any tips/suggestions? Vectorizing?



Note: I could use std::clamp and C++17, but it doesn't matter a lot here: nothing really change with that expedient.










share|improve this question




















  • 1




    I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
    – Toby Speight
    2 days ago













up vote
1
down vote

favorite









up vote
1
down vote

favorite











I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.



It processes an Oscillator, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.



Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot) optimized settings):



#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>

const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;

#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;

Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}

inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }

inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];

int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}

private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];

Param *pGain, *pOffset, *pPitch;

Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);

// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}

void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();

// precomputed data
double bp0 = noteFrequency * hostPitch;

// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

*left++ += value;
*right++ += value;

// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}

// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};

class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;

MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];

// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}

void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));

// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate

// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);

// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};

int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];

MyPlugin myPlugin;

// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);

// do somethings with buffer

}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}


Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.



I hope I can do it better. Any tips/suggestions? Vectorizing?



Note: I could use std::clamp and C++17, but it doesn't matter a lot here: nothing really change with that expedient.










share|improve this question















I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.



It processes an Oscillator, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.



Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot) optimized settings):



#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>

const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;

#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))

class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;

Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}

inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }

inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];

int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}

private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];

Param *pGain, *pOffset, *pPitch;

Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);

// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}

void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();

// precomputed data
double bp0 = noteFrequency * hostPitch;

// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

*left++ += value;
*right++ += value;

// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}

// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};

class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;

MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];

// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}

void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));

// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate

// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);

// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};

int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];

MyPlugin myPlugin;

// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);

// do somethings with buffer

}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}


Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.



I hope I can do it better. Any tips/suggestions? Vectorizing?



Note: I could use std::clamp and C++17, but it doesn't matter a lot here: nothing really change with that expedient.







c++ performance audio signal-processing






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited 2 days ago

























asked 2 days ago









markzzz

1285




1285








  • 1




    I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
    – Toby Speight
    2 days ago














  • 1




    I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
    – Toby Speight
    2 days ago








1




1




I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago




I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago










4 Answers
4






active

oldest

votes

















up vote
5
down vote














  • Don't define pi and twopi yourself. They should be available from math.h as M_PI and M_2_PI if you're in the GNU stack.

  • Defining functions as inline is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.


  • GetMin() should be GetMin() const. Same for GetRange, GetProcessedVoiceValue, and anything that doesn't modify a class member.


  • while (phase >= twopi) { phase -= twopi; } - This should not be a loop. It can be done in O(1), something like phase = fmod(phase, twopi) - but double-check this. For very small values of phase, fmod may actually be slower. You can also try phase -= twopi*int(phase/twopi) which is also O(1).

  • On this line:


double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];



You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:



double *pModVoiceValues = pModValues + voiceIndex*bufferSize;






share|improve this answer























  • M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
    – Toby Speight
    2 days ago










  • @TobySpeight Thanks; I guess those are Gnu-specific.
    – Reinderien
    2 days ago












  • @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
    – markzzz
    2 days ago










  • @markzzz The fmod will improve program speed, if only imperceptibly.
    – Reinderien
    2 days ago










  • curious: fmod slow down the process here :O
    – markzzz
    2 days ago


















up vote
5
down vote













counting flops



Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.



double value = sin(phase) *
pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

*left++ += value;
*right++ += value;

// next phase
phase += BOUNDED(
mRadiansPerSample * (
bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
sampleIndex), pitchMin, pitchRange)
+ GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
sampleIndex), offsetMin, offsetRange)
),
0, pi
);

while (phase >= twopi) { phase -= twopi; }

// . . .

inline double GetOffsetWarped(double normalizedValue, double min, double range)
{
return min + normalizedValue * range;
}
inline double GetPitchWarped(double normalizedValue, double min, double range)
{
return exp((min + normalizedValue * range) * ln2per12);
}


Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12), where omega_C = TAU*440*2**(-2 + 3/12) is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:



# code sample 0

# Rescale user inputs.
pitches = pitch_min + pitches_raw*pitch_range
omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
gains = gain_min + gains_raw*gain_range

omegas = omega_C * 2**(pitches/12) + omega_offsets
values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)


That is basically the entire program. First, the constants.



dt is the time in seconds between samples. dt = 1/sample_rate, sample_rate = 44100. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.



phase0 is the oscillator's phase at the beginning of the block you are currently processing.



gains, pitches, and omega_offsets are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains ranges from 0 to 1, pitches ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.



You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.



Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.



In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.



small loops



Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.



vectorising



You're doing the same operations blockSize times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.



#include <mkl.h>

// determines the size of a static array.
#define SASIZE(sa) (sizeof(sa)/sizeof(*sa))

// . . .

float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
float sin_phases[SASIZE(phases)];
vsSin(SASIZE(phases), phases, sin_phases);
// Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
// all at once, about 4 or 8 at a time using simd instructions.


float



This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.



integral phase



You can store and manipulate phase using uint32_ts. It is naturally modular with modulus 2**32, so you never need to do any fmods or subtractions, especially repeated subtractions in a loop.



            while (phase >= twopi) { phase -= twopi; }  // idk about this...


That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.



#include <cstdio>  // Hey, I like *printf().
#include <cinttypes>

// These are actually templated C++ math functions. The c doesn't
// stand for C, I guess...
#include <cmath>

using namespace std;

#define TAU 6.283185307179586


// fmodf, but always rounding toward negative infinity.
// The output will always be in [0, y). This is not the case
// with fmod(). See man fmod.
float fmodf_rd(float x, float y)
{
float r = fmod(x, y);
return r < 0.f? r + y : r;
}
uint32_t integral_phase(float phi)
{
return phi / (float)TAU * 0x1p32f;
}
float float_phase(uint32_t phi_i)
{
return (float)phi_i * (float)TAU * 0x1p-32f;
}

int main(int argc, const char *argv)
{
// Phases, in radians.
float alpha = TAU/12., beta = atan(2.f);
uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);

// Phase calculations in radians and floating point:
float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
// The same phase calculation in integral phase. Note there is no
// mod operation at all.
uint32_t gamma_i = 5*alpha_i - 7*beta_i;


printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
return 0;
}


Epilogue: An M. Night Shyamalan twist.



This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono badness and replaced it with dtime(), a wrapper I wrote around clock_gettime(). dtime() returns the current value of CLOCK_MONOTONIC in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:



--- mod.cpp-    2018-11-20 03:19:11.091296221 -0500
+++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
@@ -1,7 +1,21 @@
-#include <iostream>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
#include <cstring>
#include <cmath>
-#include <chrono>
+
+double timespec_to_d(struct timespec *ts)
+{
+ return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
+}
+double dtime(void)
+{
+ struct timespec ts;
+ int res = clock_gettime(CLOCK_MONOTONIC, &ts);
+ assert(res == 0);
+ return timespec_to_d(&ts);
+}

const int voiceSize = 16;
const int bufferSize = 256;
@@ -154,7 +168,7 @@
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
- std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
+ double t0 = dtime();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
@@ -162,6 +176,8 @@
// do somethings with buffer

}
- std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
- std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
+ double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
+ dt_total = 1./sampleRate;
+ printf("CPU = %.6g %%n", dt_busy/dt_total);
+ return 0;
}


dt_busy is the total amount of time your code takes to process a single sample. dt_total, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.



aaaaaaand this is the output:



% g++ -o mod mod.cpp
% ./mod
CPU = 0.105791 %
%


When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.






share|improve this answer










New contributor




enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.


















  • Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
    – markzzz
    yesterday










  • Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
    – markzzz
    yesterday










  • Yeah, when set to zero. For every other value, though, lol
    – enigmaticPhysicist
    yesterday










  • for electronic music and "kickdrum synth" is a good tweak :)
    – markzzz
    yesterday










  • also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
    – markzzz
    yesterday


















up vote
4
down vote













General overview



I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.



The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue() doesn't indicate what kind of processing was done; ln2per12 only captures how it was computed, rather than what it's for, and MyPlugin looks like a generic temporary name.



Specific issues



BOUNDED() is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp() or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.



Variable-length arrays are not standard C++. Fix this by making voiceSize and bufferSize compile-time constants, aka constexpr (rather than plain const).



Prefer initialisation to assignment. In Param, for example, we could make mMin, mMax and mRange const if we initialize:



    Param(double min, double max)
: mMin{min},
// mMax was never used, so don't store it
mRange{max - min}
{
}

// make them public, so we don't need `get_min()` etc.
const double mMin, mRange;


The uninitialised pModValues pointer is especially dangerous here.



I don't understand why we're updating left and right identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy() to duplicate it afterwards.



In Oscillator, there's no need for pGain, pOffset and pPitch to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc) by making them simple member variables - don't write Java code!



Param pGain, pOffset, pPitch;

Oscillator()
: pGain{0.0, 1.0},
pOffset{-900.0, 900.0},
pPitch{-48.0, 48.0}
{ /* ... */ }

// destructor is now defaulted


Some functions from <cmath> wrongly accessed in the global namespace (namely std::sin() and std::exp()); also std::memset() from <cstring>. These should be correctly qualified for portable code.



Prefer std::fill() or std::fill_n() over std::memset() - the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:



    std::fill_n(bufferLeft, blockSize, 0);
std::fill_n(bufferRight, blockSize, 0);


Declare main() with no arguments when not intending to process command-line options.



If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native to my compilation command).






share|improve this answer























  • BOUNDED: What do you mean "evalutate its arguments more than once"?
    – markzzz
    2 days ago










  • BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
    – Toby Speight
    2 days ago












  • I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
    – markzzz
    2 days ago










  • The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
    – markzzz
    2 days ago




















up vote
1
down vote













I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.



$$12 log_2 frac {130.81278} {55} approx 15$$



$$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$



$$f approx 130.812,782,650,299...$$



In C,



const double noteFrequency = 110*pow(2, 0.25);





share|improve this answer























    Your Answer





    StackExchange.ifUsing("editor", function () {
    return StackExchange.using("mathjaxEditing", function () {
    StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
    StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
    });
    });
    }, "mathjax-editing");

    StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
    StackExchange.using("snippets", function () {
    StackExchange.snippets.init();
    });
    });
    }, "code-snippets");

    StackExchange.ready(function() {
    var channelOptions = {
    tags: "".split(" "),
    id: "196"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
    StackExchange.using("snippets", function() {
    createEditor();
    });
    }
    else {
    createEditor();
    }
    });

    function createEditor() {
    StackExchange.prepareEditor({
    heartbeatType: 'answer',
    convertImagesToLinks: false,
    noModals: true,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: null,
    bindNavPrevention: true,
    postfix: "",
    imageUploader: {
    brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
    contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
    allowUrls: true
    },
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    });


    }
    });














     

    draft saved


    draft discarded


















    StackExchange.ready(
    function () {
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f207967%2fmodulating-an-oscillator-by-gain-pitch-and-offset%23new-answer', 'question_page');
    }
    );

    Post as a guest















    Required, but never shown

























    4 Answers
    4






    active

    oldest

    votes








    4 Answers
    4






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes








    up vote
    5
    down vote














    • Don't define pi and twopi yourself. They should be available from math.h as M_PI and M_2_PI if you're in the GNU stack.

    • Defining functions as inline is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.


    • GetMin() should be GetMin() const. Same for GetRange, GetProcessedVoiceValue, and anything that doesn't modify a class member.


    • while (phase >= twopi) { phase -= twopi; } - This should not be a loop. It can be done in O(1), something like phase = fmod(phase, twopi) - but double-check this. For very small values of phase, fmod may actually be slower. You can also try phase -= twopi*int(phase/twopi) which is also O(1).

    • On this line:


    double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];



    You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:



    double *pModVoiceValues = pModValues + voiceIndex*bufferSize;






    share|improve this answer























    • M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
      – Toby Speight
      2 days ago










    • @TobySpeight Thanks; I guess those are Gnu-specific.
      – Reinderien
      2 days ago












    • @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
      – markzzz
      2 days ago










    • @markzzz The fmod will improve program speed, if only imperceptibly.
      – Reinderien
      2 days ago










    • curious: fmod slow down the process here :O
      – markzzz
      2 days ago















    up vote
    5
    down vote














    • Don't define pi and twopi yourself. They should be available from math.h as M_PI and M_2_PI if you're in the GNU stack.

    • Defining functions as inline is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.


    • GetMin() should be GetMin() const. Same for GetRange, GetProcessedVoiceValue, and anything that doesn't modify a class member.


    • while (phase >= twopi) { phase -= twopi; } - This should not be a loop. It can be done in O(1), something like phase = fmod(phase, twopi) - but double-check this. For very small values of phase, fmod may actually be slower. You can also try phase -= twopi*int(phase/twopi) which is also O(1).

    • On this line:


    double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];



    You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:



    double *pModVoiceValues = pModValues + voiceIndex*bufferSize;






    share|improve this answer























    • M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
      – Toby Speight
      2 days ago










    • @TobySpeight Thanks; I guess those are Gnu-specific.
      – Reinderien
      2 days ago












    • @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
      – markzzz
      2 days ago










    • @markzzz The fmod will improve program speed, if only imperceptibly.
      – Reinderien
      2 days ago










    • curious: fmod slow down the process here :O
      – markzzz
      2 days ago













    up vote
    5
    down vote










    up vote
    5
    down vote










    • Don't define pi and twopi yourself. They should be available from math.h as M_PI and M_2_PI if you're in the GNU stack.

    • Defining functions as inline is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.


    • GetMin() should be GetMin() const. Same for GetRange, GetProcessedVoiceValue, and anything that doesn't modify a class member.


    • while (phase >= twopi) { phase -= twopi; } - This should not be a loop. It can be done in O(1), something like phase = fmod(phase, twopi) - but double-check this. For very small values of phase, fmod may actually be slower. You can also try phase -= twopi*int(phase/twopi) which is also O(1).

    • On this line:


    double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];



    You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:



    double *pModVoiceValues = pModValues + voiceIndex*bufferSize;






    share|improve this answer















    • Don't define pi and twopi yourself. They should be available from math.h as M_PI and M_2_PI if you're in the GNU stack.

    • Defining functions as inline is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.


    • GetMin() should be GetMin() const. Same for GetRange, GetProcessedVoiceValue, and anything that doesn't modify a class member.


    • while (phase >= twopi) { phase -= twopi; } - This should not be a loop. It can be done in O(1), something like phase = fmod(phase, twopi) - but double-check this. For very small values of phase, fmod may actually be slower. You can also try phase -= twopi*int(phase/twopi) which is also O(1).

    • On this line:


    double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];



    You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:



    double *pModVoiceValues = pModValues + voiceIndex*bufferSize;







    share|improve this answer














    share|improve this answer



    share|improve this answer








    edited 2 days ago

























    answered 2 days ago









    Reinderien

    1,272516




    1,272516












    • M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
      – Toby Speight
      2 days ago










    • @TobySpeight Thanks; I guess those are Gnu-specific.
      – Reinderien
      2 days ago












    • @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
      – markzzz
      2 days ago










    • @markzzz The fmod will improve program speed, if only imperceptibly.
      – Reinderien
      2 days ago










    • curious: fmod slow down the process here :O
      – markzzz
      2 days ago


















    • M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
      – Toby Speight
      2 days ago










    • @TobySpeight Thanks; I guess those are Gnu-specific.
      – Reinderien
      2 days ago












    • @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
      – markzzz
      2 days ago










    • @markzzz The fmod will improve program speed, if only imperceptibly.
      – Reinderien
      2 days ago










    • curious: fmod slow down the process here :O
      – markzzz
      2 days ago
















    M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
    – Toby Speight
    2 days ago




    M_PI and M_2_PI are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1), for example.
    – Toby Speight
    2 days ago












    @TobySpeight Thanks; I guess those are Gnu-specific.
    – Reinderien
    2 days ago






    @TobySpeight Thanks; I guess those are Gnu-specific.
    – Reinderien
    2 days ago














    @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
    – markzzz
    2 days ago




    @Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
    – markzzz
    2 days ago












    @markzzz The fmod will improve program speed, if only imperceptibly.
    – Reinderien
    2 days ago




    @markzzz The fmod will improve program speed, if only imperceptibly.
    – Reinderien
    2 days ago












    curious: fmod slow down the process here :O
    – markzzz
    2 days ago




    curious: fmod slow down the process here :O
    – markzzz
    2 days ago












    up vote
    5
    down vote













    counting flops



    Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.



    double value = sin(phase) *
    pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

    *left++ += value;
    *right++ += value;

    // next phase
    phase += BOUNDED(
    mRadiansPerSample * (
    bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), pitchMin, pitchRange)
    + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), offsetMin, offsetRange)
    ),
    0, pi
    );

    while (phase >= twopi) { phase -= twopi; }

    // . . .

    inline double GetOffsetWarped(double normalizedValue, double min, double range)
    {
    return min + normalizedValue * range;
    }
    inline double GetPitchWarped(double normalizedValue, double min, double range)
    {
    return exp((min + normalizedValue * range) * ln2per12);
    }


    Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12), where omega_C = TAU*440*2**(-2 + 3/12) is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:



    # code sample 0

    # Rescale user inputs.
    pitches = pitch_min + pitches_raw*pitch_range
    omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
    gains = gain_min + gains_raw*gain_range

    omegas = omega_C * 2**(pitches/12) + omega_offsets
    values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)


    That is basically the entire program. First, the constants.



    dt is the time in seconds between samples. dt = 1/sample_rate, sample_rate = 44100. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.



    phase0 is the oscillator's phase at the beginning of the block you are currently processing.



    gains, pitches, and omega_offsets are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains ranges from 0 to 1, pitches ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.



    You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.



    Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.



    In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.



    small loops



    Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.



    vectorising



    You're doing the same operations blockSize times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.



    #include <mkl.h>

    // determines the size of a static array.
    #define SASIZE(sa) (sizeof(sa)/sizeof(*sa))

    // . . .

    float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
    float sin_phases[SASIZE(phases)];
    vsSin(SASIZE(phases), phases, sin_phases);
    // Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
    // all at once, about 4 or 8 at a time using simd instructions.


    float



    This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.



    integral phase



    You can store and manipulate phase using uint32_ts. It is naturally modular with modulus 2**32, so you never need to do any fmods or subtractions, especially repeated subtractions in a loop.



                while (phase >= twopi) { phase -= twopi; }  // idk about this...


    That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.



    #include <cstdio>  // Hey, I like *printf().
    #include <cinttypes>

    // These are actually templated C++ math functions. The c doesn't
    // stand for C, I guess...
    #include <cmath>

    using namespace std;

    #define TAU 6.283185307179586


    // fmodf, but always rounding toward negative infinity.
    // The output will always be in [0, y). This is not the case
    // with fmod(). See man fmod.
    float fmodf_rd(float x, float y)
    {
    float r = fmod(x, y);
    return r < 0.f? r + y : r;
    }
    uint32_t integral_phase(float phi)
    {
    return phi / (float)TAU * 0x1p32f;
    }
    float float_phase(uint32_t phi_i)
    {
    return (float)phi_i * (float)TAU * 0x1p-32f;
    }

    int main(int argc, const char *argv)
    {
    // Phases, in radians.
    float alpha = TAU/12., beta = atan(2.f);
    uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);

    // Phase calculations in radians and floating point:
    float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
    // The same phase calculation in integral phase. Note there is no
    // mod operation at all.
    uint32_t gamma_i = 5*alpha_i - 7*beta_i;


    printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
    return 0;
    }


    Epilogue: An M. Night Shyamalan twist.



    This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono badness and replaced it with dtime(), a wrapper I wrote around clock_gettime(). dtime() returns the current value of CLOCK_MONOTONIC in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:



    --- mod.cpp-    2018-11-20 03:19:11.091296221 -0500
    +++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
    @@ -1,7 +1,21 @@
    -#include <iostream>
    +#include <stdio.h>
    +#include <time.h>
    +#include <assert.h>
    +
    #include <cstring>
    #include <cmath>
    -#include <chrono>
    +
    +double timespec_to_d(struct timespec *ts)
    +{
    + return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
    +}
    +double dtime(void)
    +{
    + struct timespec ts;
    + int res = clock_gettime(CLOCK_MONOTONIC, &ts);
    + assert(res == 0);
    + return timespec_to_d(&ts);
    +}

    const int voiceSize = 16;
    const int bufferSize = 256;
    @@ -154,7 +168,7 @@
    // audio host call
    int numProcessing = 1024 * 50;
    int counterProcessing = 0;
    - std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
    + double t0 = dtime();
    while (counterProcessing++ < numProcessing) {
    int blockSize = 256;
    myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
    @@ -162,6 +176,8 @@
    // do somethings with buffer

    }
    - std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
    - std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
    + double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
    + dt_total = 1./sampleRate;
    + printf("CPU = %.6g %%n", dt_busy/dt_total);
    + return 0;
    }


    dt_busy is the total amount of time your code takes to process a single sample. dt_total, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.



    aaaaaaand this is the output:



    % g++ -o mod mod.cpp
    % ./mod
    CPU = 0.105791 %
    %


    When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.






    share|improve this answer










    New contributor




    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.


















    • Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
      – markzzz
      yesterday










    • Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
      – markzzz
      yesterday










    • Yeah, when set to zero. For every other value, though, lol
      – enigmaticPhysicist
      yesterday










    • for electronic music and "kickdrum synth" is a good tweak :)
      – markzzz
      yesterday










    • also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
      – markzzz
      yesterday















    up vote
    5
    down vote













    counting flops



    Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.



    double value = sin(phase) *
    pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

    *left++ += value;
    *right++ += value;

    // next phase
    phase += BOUNDED(
    mRadiansPerSample * (
    bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), pitchMin, pitchRange)
    + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), offsetMin, offsetRange)
    ),
    0, pi
    );

    while (phase >= twopi) { phase -= twopi; }

    // . . .

    inline double GetOffsetWarped(double normalizedValue, double min, double range)
    {
    return min + normalizedValue * range;
    }
    inline double GetPitchWarped(double normalizedValue, double min, double range)
    {
    return exp((min + normalizedValue * range) * ln2per12);
    }


    Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12), where omega_C = TAU*440*2**(-2 + 3/12) is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:



    # code sample 0

    # Rescale user inputs.
    pitches = pitch_min + pitches_raw*pitch_range
    omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
    gains = gain_min + gains_raw*gain_range

    omegas = omega_C * 2**(pitches/12) + omega_offsets
    values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)


    That is basically the entire program. First, the constants.



    dt is the time in seconds between samples. dt = 1/sample_rate, sample_rate = 44100. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.



    phase0 is the oscillator's phase at the beginning of the block you are currently processing.



    gains, pitches, and omega_offsets are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains ranges from 0 to 1, pitches ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.



    You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.



    Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.



    In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.



    small loops



    Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.



    vectorising



    You're doing the same operations blockSize times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.



    #include <mkl.h>

    // determines the size of a static array.
    #define SASIZE(sa) (sizeof(sa)/sizeof(*sa))

    // . . .

    float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
    float sin_phases[SASIZE(phases)];
    vsSin(SASIZE(phases), phases, sin_phases);
    // Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
    // all at once, about 4 or 8 at a time using simd instructions.


    float



    This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.



    integral phase



    You can store and manipulate phase using uint32_ts. It is naturally modular with modulus 2**32, so you never need to do any fmods or subtractions, especially repeated subtractions in a loop.



                while (phase >= twopi) { phase -= twopi; }  // idk about this...


    That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.



    #include <cstdio>  // Hey, I like *printf().
    #include <cinttypes>

    // These are actually templated C++ math functions. The c doesn't
    // stand for C, I guess...
    #include <cmath>

    using namespace std;

    #define TAU 6.283185307179586


    // fmodf, but always rounding toward negative infinity.
    // The output will always be in [0, y). This is not the case
    // with fmod(). See man fmod.
    float fmodf_rd(float x, float y)
    {
    float r = fmod(x, y);
    return r < 0.f? r + y : r;
    }
    uint32_t integral_phase(float phi)
    {
    return phi / (float)TAU * 0x1p32f;
    }
    float float_phase(uint32_t phi_i)
    {
    return (float)phi_i * (float)TAU * 0x1p-32f;
    }

    int main(int argc, const char *argv)
    {
    // Phases, in radians.
    float alpha = TAU/12., beta = atan(2.f);
    uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);

    // Phase calculations in radians and floating point:
    float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
    // The same phase calculation in integral phase. Note there is no
    // mod operation at all.
    uint32_t gamma_i = 5*alpha_i - 7*beta_i;


    printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
    return 0;
    }


    Epilogue: An M. Night Shyamalan twist.



    This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono badness and replaced it with dtime(), a wrapper I wrote around clock_gettime(). dtime() returns the current value of CLOCK_MONOTONIC in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:



    --- mod.cpp-    2018-11-20 03:19:11.091296221 -0500
    +++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
    @@ -1,7 +1,21 @@
    -#include <iostream>
    +#include <stdio.h>
    +#include <time.h>
    +#include <assert.h>
    +
    #include <cstring>
    #include <cmath>
    -#include <chrono>
    +
    +double timespec_to_d(struct timespec *ts)
    +{
    + return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
    +}
    +double dtime(void)
    +{
    + struct timespec ts;
    + int res = clock_gettime(CLOCK_MONOTONIC, &ts);
    + assert(res == 0);
    + return timespec_to_d(&ts);
    +}

    const int voiceSize = 16;
    const int bufferSize = 256;
    @@ -154,7 +168,7 @@
    // audio host call
    int numProcessing = 1024 * 50;
    int counterProcessing = 0;
    - std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
    + double t0 = dtime();
    while (counterProcessing++ < numProcessing) {
    int blockSize = 256;
    myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
    @@ -162,6 +176,8 @@
    // do somethings with buffer

    }
    - std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
    - std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
    + double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
    + dt_total = 1./sampleRate;
    + printf("CPU = %.6g %%n", dt_busy/dt_total);
    + return 0;
    }


    dt_busy is the total amount of time your code takes to process a single sample. dt_total, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.



    aaaaaaand this is the output:



    % g++ -o mod mod.cpp
    % ./mod
    CPU = 0.105791 %
    %


    When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.






    share|improve this answer










    New contributor




    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.


















    • Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
      – markzzz
      yesterday










    • Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
      – markzzz
      yesterday










    • Yeah, when set to zero. For every other value, though, lol
      – enigmaticPhysicist
      yesterday










    • for electronic music and "kickdrum synth" is a good tweak :)
      – markzzz
      yesterday










    • also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
      – markzzz
      yesterday













    up vote
    5
    down vote










    up vote
    5
    down vote









    counting flops



    Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.



    double value = sin(phase) *
    pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

    *left++ += value;
    *right++ += value;

    // next phase
    phase += BOUNDED(
    mRadiansPerSample * (
    bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), pitchMin, pitchRange)
    + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), offsetMin, offsetRange)
    ),
    0, pi
    );

    while (phase >= twopi) { phase -= twopi; }

    // . . .

    inline double GetOffsetWarped(double normalizedValue, double min, double range)
    {
    return min + normalizedValue * range;
    }
    inline double GetPitchWarped(double normalizedValue, double min, double range)
    {
    return exp((min + normalizedValue * range) * ln2per12);
    }


    Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12), where omega_C = TAU*440*2**(-2 + 3/12) is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:



    # code sample 0

    # Rescale user inputs.
    pitches = pitch_min + pitches_raw*pitch_range
    omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
    gains = gain_min + gains_raw*gain_range

    omegas = omega_C * 2**(pitches/12) + omega_offsets
    values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)


    That is basically the entire program. First, the constants.



    dt is the time in seconds between samples. dt = 1/sample_rate, sample_rate = 44100. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.



    phase0 is the oscillator's phase at the beginning of the block you are currently processing.



    gains, pitches, and omega_offsets are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains ranges from 0 to 1, pitches ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.



    You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.



    Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.



    In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.



    small loops



    Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.



    vectorising



    You're doing the same operations blockSize times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.



    #include <mkl.h>

    // determines the size of a static array.
    #define SASIZE(sa) (sizeof(sa)/sizeof(*sa))

    // . . .

    float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
    float sin_phases[SASIZE(phases)];
    vsSin(SASIZE(phases), phases, sin_phases);
    // Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
    // all at once, about 4 or 8 at a time using simd instructions.


    float



    This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.



    integral phase



    You can store and manipulate phase using uint32_ts. It is naturally modular with modulus 2**32, so you never need to do any fmods or subtractions, especially repeated subtractions in a loop.



                while (phase >= twopi) { phase -= twopi; }  // idk about this...


    That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.



    #include <cstdio>  // Hey, I like *printf().
    #include <cinttypes>

    // These are actually templated C++ math functions. The c doesn't
    // stand for C, I guess...
    #include <cmath>

    using namespace std;

    #define TAU 6.283185307179586


    // fmodf, but always rounding toward negative infinity.
    // The output will always be in [0, y). This is not the case
    // with fmod(). See man fmod.
    float fmodf_rd(float x, float y)
    {
    float r = fmod(x, y);
    return r < 0.f? r + y : r;
    }
    uint32_t integral_phase(float phi)
    {
    return phi / (float)TAU * 0x1p32f;
    }
    float float_phase(uint32_t phi_i)
    {
    return (float)phi_i * (float)TAU * 0x1p-32f;
    }

    int main(int argc, const char *argv)
    {
    // Phases, in radians.
    float alpha = TAU/12., beta = atan(2.f);
    uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);

    // Phase calculations in radians and floating point:
    float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
    // The same phase calculation in integral phase. Note there is no
    // mod operation at all.
    uint32_t gamma_i = 5*alpha_i - 7*beta_i;


    printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
    return 0;
    }


    Epilogue: An M. Night Shyamalan twist.



    This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono badness and replaced it with dtime(), a wrapper I wrote around clock_gettime(). dtime() returns the current value of CLOCK_MONOTONIC in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:



    --- mod.cpp-    2018-11-20 03:19:11.091296221 -0500
    +++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
    @@ -1,7 +1,21 @@
    -#include <iostream>
    +#include <stdio.h>
    +#include <time.h>
    +#include <assert.h>
    +
    #include <cstring>
    #include <cmath>
    -#include <chrono>
    +
    +double timespec_to_d(struct timespec *ts)
    +{
    + return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
    +}
    +double dtime(void)
    +{
    + struct timespec ts;
    + int res = clock_gettime(CLOCK_MONOTONIC, &ts);
    + assert(res == 0);
    + return timespec_to_d(&ts);
    +}

    const int voiceSize = 16;
    const int bufferSize = 256;
    @@ -154,7 +168,7 @@
    // audio host call
    int numProcessing = 1024 * 50;
    int counterProcessing = 0;
    - std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
    + double t0 = dtime();
    while (counterProcessing++ < numProcessing) {
    int blockSize = 256;
    myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
    @@ -162,6 +176,8 @@
    // do somethings with buffer

    }
    - std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
    - std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
    + double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
    + dt_total = 1./sampleRate;
    + printf("CPU = %.6g %%n", dt_busy/dt_total);
    + return 0;
    }


    dt_busy is the total amount of time your code takes to process a single sample. dt_total, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.



    aaaaaaand this is the output:



    % g++ -o mod mod.cpp
    % ./mod
    CPU = 0.105791 %
    %


    When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.






    share|improve this answer










    New contributor




    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.









    counting flops



    Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.



    double value = sin(phase) *
    pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);

    *left++ += value;
    *right++ += value;

    // next phase
    phase += BOUNDED(
    mRadiansPerSample * (
    bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), pitchMin, pitchRange)
    + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
    sampleIndex), offsetMin, offsetRange)
    ),
    0, pi
    );

    while (phase >= twopi) { phase -= twopi; }

    // . . .

    inline double GetOffsetWarped(double normalizedValue, double min, double range)
    {
    return min + normalizedValue * range;
    }
    inline double GetPitchWarped(double normalizedValue, double min, double range)
    {
    return exp((min + normalizedValue * range) * ln2per12);
    }


    Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12), where omega_C = TAU*440*2**(-2 + 3/12) is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:



    # code sample 0

    # Rescale user inputs.
    pitches = pitch_min + pitches_raw*pitch_range
    omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
    gains = gain_min + gains_raw*gain_range

    omegas = omega_C * 2**(pitches/12) + omega_offsets
    values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)


    That is basically the entire program. First, the constants.



    dt is the time in seconds between samples. dt = 1/sample_rate, sample_rate = 44100. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.



    phase0 is the oscillator's phase at the beginning of the block you are currently processing.



    gains, pitches, and omega_offsets are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains ranges from 0 to 1, pitches ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.



    You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.



    Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.



    In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.



    small loops



    Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.



    vectorising



    You're doing the same operations blockSize times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.



    #include <mkl.h>

    // determines the size of a static array.
    #define SASIZE(sa) (sizeof(sa)/sizeof(*sa))

    // . . .

    float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
    float sin_phases[SASIZE(phases)];
    vsSin(SASIZE(phases), phases, sin_phases);
    // Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
    // all at once, about 4 or 8 at a time using simd instructions.


    float



    This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.



    integral phase



    You can store and manipulate phase using uint32_ts. It is naturally modular with modulus 2**32, so you never need to do any fmods or subtractions, especially repeated subtractions in a loop.



                while (phase >= twopi) { phase -= twopi; }  // idk about this...


    That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.



    #include <cstdio>  // Hey, I like *printf().
    #include <cinttypes>

    // These are actually templated C++ math functions. The c doesn't
    // stand for C, I guess...
    #include <cmath>

    using namespace std;

    #define TAU 6.283185307179586


    // fmodf, but always rounding toward negative infinity.
    // The output will always be in [0, y). This is not the case
    // with fmod(). See man fmod.
    float fmodf_rd(float x, float y)
    {
    float r = fmod(x, y);
    return r < 0.f? r + y : r;
    }
    uint32_t integral_phase(float phi)
    {
    return phi / (float)TAU * 0x1p32f;
    }
    float float_phase(uint32_t phi_i)
    {
    return (float)phi_i * (float)TAU * 0x1p-32f;
    }

    int main(int argc, const char *argv)
    {
    // Phases, in radians.
    float alpha = TAU/12., beta = atan(2.f);
    uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);

    // Phase calculations in radians and floating point:
    float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
    // The same phase calculation in integral phase. Note there is no
    // mod operation at all.
    uint32_t gamma_i = 5*alpha_i - 7*beta_i;


    printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
    return 0;
    }


    Epilogue: An M. Night Shyamalan twist.



    This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono badness and replaced it with dtime(), a wrapper I wrote around clock_gettime(). dtime() returns the current value of CLOCK_MONOTONIC in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:



    --- mod.cpp-    2018-11-20 03:19:11.091296221 -0500
    +++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
    @@ -1,7 +1,21 @@
    -#include <iostream>
    +#include <stdio.h>
    +#include <time.h>
    +#include <assert.h>
    +
    #include <cstring>
    #include <cmath>
    -#include <chrono>
    +
    +double timespec_to_d(struct timespec *ts)
    +{
    + return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
    +}
    +double dtime(void)
    +{
    + struct timespec ts;
    + int res = clock_gettime(CLOCK_MONOTONIC, &ts);
    + assert(res == 0);
    + return timespec_to_d(&ts);
    +}

    const int voiceSize = 16;
    const int bufferSize = 256;
    @@ -154,7 +168,7 @@
    // audio host call
    int numProcessing = 1024 * 50;
    int counterProcessing = 0;
    - std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
    + double t0 = dtime();
    while (counterProcessing++ < numProcessing) {
    int blockSize = 256;
    myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
    @@ -162,6 +176,8 @@
    // do somethings with buffer

    }
    - std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
    - std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
    + double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
    + dt_total = 1./sampleRate;
    + printf("CPU = %.6g %%n", dt_busy/dt_total);
    + return 0;
    }


    dt_busy is the total amount of time your code takes to process a single sample. dt_total, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.



    aaaaaaand this is the output:



    % g++ -o mod mod.cpp
    % ./mod
    CPU = 0.105791 %
    %


    When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.







    share|improve this answer










    New contributor




    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.









    share|improve this answer



    share|improve this answer








    edited yesterday





















    New contributor




    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.









    answered 2 days ago









    enigmaticPhysicist

    1513




    1513




    New contributor




    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.





    New contributor





    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.






    enigmaticPhysicist is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
    Check out our Code of Conduct.












    • Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
      – markzzz
      yesterday










    • Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
      – markzzz
      yesterday










    • Yeah, when set to zero. For every other value, though, lol
      – enigmaticPhysicist
      yesterday










    • for electronic music and "kickdrum synth" is a good tweak :)
      – markzzz
      yesterday










    • also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
      – markzzz
      yesterday


















    • Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
      – markzzz
      yesterday










    • Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
      – markzzz
      yesterday










    • Yeah, when set to zero. For every other value, though, lol
      – enigmaticPhysicist
      yesterday










    • for electronic music and "kickdrum synth" is a good tweak :)
      – markzzz
      yesterday










    • also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
      – markzzz
      yesterday
















    Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
    – markzzz
    yesterday




    Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
    – markzzz
    yesterday












    Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
    – markzzz
    yesterday




    Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
    – markzzz
    yesterday












    Yeah, when set to zero. For every other value, though, lol
    – enigmaticPhysicist
    yesterday




    Yeah, when set to zero. For every other value, though, lol
    – enigmaticPhysicist
    yesterday












    for electronic music and "kickdrum synth" is a good tweak :)
    – markzzz
    yesterday




    for electronic music and "kickdrum synth" is a good tweak :)
    – markzzz
    yesterday












    also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
    – markzzz
    yesterday




    also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
    – markzzz
    yesterday










    up vote
    4
    down vote













    General overview



    I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.



    The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue() doesn't indicate what kind of processing was done; ln2per12 only captures how it was computed, rather than what it's for, and MyPlugin looks like a generic temporary name.



    Specific issues



    BOUNDED() is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp() or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.



    Variable-length arrays are not standard C++. Fix this by making voiceSize and bufferSize compile-time constants, aka constexpr (rather than plain const).



    Prefer initialisation to assignment. In Param, for example, we could make mMin, mMax and mRange const if we initialize:



        Param(double min, double max)
    : mMin{min},
    // mMax was never used, so don't store it
    mRange{max - min}
    {
    }

    // make them public, so we don't need `get_min()` etc.
    const double mMin, mRange;


    The uninitialised pModValues pointer is especially dangerous here.



    I don't understand why we're updating left and right identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy() to duplicate it afterwards.



    In Oscillator, there's no need for pGain, pOffset and pPitch to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc) by making them simple member variables - don't write Java code!



    Param pGain, pOffset, pPitch;

    Oscillator()
    : pGain{0.0, 1.0},
    pOffset{-900.0, 900.0},
    pPitch{-48.0, 48.0}
    { /* ... */ }

    // destructor is now defaulted


    Some functions from <cmath> wrongly accessed in the global namespace (namely std::sin() and std::exp()); also std::memset() from <cstring>. These should be correctly qualified for portable code.



    Prefer std::fill() or std::fill_n() over std::memset() - the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:



        std::fill_n(bufferLeft, blockSize, 0);
    std::fill_n(bufferRight, blockSize, 0);


    Declare main() with no arguments when not intending to process command-line options.



    If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native to my compilation command).






    share|improve this answer























    • BOUNDED: What do you mean "evalutate its arguments more than once"?
      – markzzz
      2 days ago










    • BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
      – Toby Speight
      2 days ago












    • I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
      – markzzz
      2 days ago










    • The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
      – markzzz
      2 days ago

















    up vote
    4
    down vote













    General overview



    I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.



    The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue() doesn't indicate what kind of processing was done; ln2per12 only captures how it was computed, rather than what it's for, and MyPlugin looks like a generic temporary name.



    Specific issues



    BOUNDED() is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp() or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.



    Variable-length arrays are not standard C++. Fix this by making voiceSize and bufferSize compile-time constants, aka constexpr (rather than plain const).



    Prefer initialisation to assignment. In Param, for example, we could make mMin, mMax and mRange const if we initialize:



        Param(double min, double max)
    : mMin{min},
    // mMax was never used, so don't store it
    mRange{max - min}
    {
    }

    // make them public, so we don't need `get_min()` etc.
    const double mMin, mRange;


    The uninitialised pModValues pointer is especially dangerous here.



    I don't understand why we're updating left and right identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy() to duplicate it afterwards.



    In Oscillator, there's no need for pGain, pOffset and pPitch to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc) by making them simple member variables - don't write Java code!



    Param pGain, pOffset, pPitch;

    Oscillator()
    : pGain{0.0, 1.0},
    pOffset{-900.0, 900.0},
    pPitch{-48.0, 48.0}
    { /* ... */ }

    // destructor is now defaulted


    Some functions from <cmath> wrongly accessed in the global namespace (namely std::sin() and std::exp()); also std::memset() from <cstring>. These should be correctly qualified for portable code.



    Prefer std::fill() or std::fill_n() over std::memset() - the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:



        std::fill_n(bufferLeft, blockSize, 0);
    std::fill_n(bufferRight, blockSize, 0);


    Declare main() with no arguments when not intending to process command-line options.



    If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native to my compilation command).






    share|improve this answer























    • BOUNDED: What do you mean "evalutate its arguments more than once"?
      – markzzz
      2 days ago










    • BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
      – Toby Speight
      2 days ago












    • I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
      – markzzz
      2 days ago










    • The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
      – markzzz
      2 days ago















    up vote
    4
    down vote










    up vote
    4
    down vote









    General overview



    I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.



    The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue() doesn't indicate what kind of processing was done; ln2per12 only captures how it was computed, rather than what it's for, and MyPlugin looks like a generic temporary name.



    Specific issues



    BOUNDED() is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp() or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.



    Variable-length arrays are not standard C++. Fix this by making voiceSize and bufferSize compile-time constants, aka constexpr (rather than plain const).



    Prefer initialisation to assignment. In Param, for example, we could make mMin, mMax and mRange const if we initialize:



        Param(double min, double max)
    : mMin{min},
    // mMax was never used, so don't store it
    mRange{max - min}
    {
    }

    // make them public, so we don't need `get_min()` etc.
    const double mMin, mRange;


    The uninitialised pModValues pointer is especially dangerous here.



    I don't understand why we're updating left and right identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy() to duplicate it afterwards.



    In Oscillator, there's no need for pGain, pOffset and pPitch to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc) by making them simple member variables - don't write Java code!



    Param pGain, pOffset, pPitch;

    Oscillator()
    : pGain{0.0, 1.0},
    pOffset{-900.0, 900.0},
    pPitch{-48.0, 48.0}
    { /* ... */ }

    // destructor is now defaulted


    Some functions from <cmath> wrongly accessed in the global namespace (namely std::sin() and std::exp()); also std::memset() from <cstring>. These should be correctly qualified for portable code.



    Prefer std::fill() or std::fill_n() over std::memset() - the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:



        std::fill_n(bufferLeft, blockSize, 0);
    std::fill_n(bufferRight, blockSize, 0);


    Declare main() with no arguments when not intending to process command-line options.



    If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native to my compilation command).






    share|improve this answer














    General overview



    I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.



    The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue() doesn't indicate what kind of processing was done; ln2per12 only captures how it was computed, rather than what it's for, and MyPlugin looks like a generic temporary name.



    Specific issues



    BOUNDED() is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp() or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.



    Variable-length arrays are not standard C++. Fix this by making voiceSize and bufferSize compile-time constants, aka constexpr (rather than plain const).



    Prefer initialisation to assignment. In Param, for example, we could make mMin, mMax and mRange const if we initialize:



        Param(double min, double max)
    : mMin{min},
    // mMax was never used, so don't store it
    mRange{max - min}
    {
    }

    // make them public, so we don't need `get_min()` etc.
    const double mMin, mRange;


    The uninitialised pModValues pointer is especially dangerous here.



    I don't understand why we're updating left and right identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy() to duplicate it afterwards.



    In Oscillator, there's no need for pGain, pOffset and pPitch to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc) by making them simple member variables - don't write Java code!



    Param pGain, pOffset, pPitch;

    Oscillator()
    : pGain{0.0, 1.0},
    pOffset{-900.0, 900.0},
    pPitch{-48.0, 48.0}
    { /* ... */ }

    // destructor is now defaulted


    Some functions from <cmath> wrongly accessed in the global namespace (namely std::sin() and std::exp()); also std::memset() from <cstring>. These should be correctly qualified for portable code.



    Prefer std::fill() or std::fill_n() over std::memset() - the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:



        std::fill_n(bufferLeft, blockSize, 0);
    std::fill_n(bufferRight, blockSize, 0);


    Declare main() with no arguments when not intending to process command-line options.



    If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native to my compilation command).







    share|improve this answer














    share|improve this answer



    share|improve this answer








    edited 2 days ago

























    answered 2 days ago









    Toby Speight

    22.2k536108




    22.2k536108












    • BOUNDED: What do you mean "evalutate its arguments more than once"?
      – markzzz
      2 days ago










    • BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
      – Toby Speight
      2 days ago












    • I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
      – markzzz
      2 days ago










    • The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
      – markzzz
      2 days ago




















    • BOUNDED: What do you mean "evalutate its arguments more than once"?
      – markzzz
      2 days ago










    • BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
      – Toby Speight
      2 days ago












    • I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
      – markzzz
      2 days ago










    • The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
      – markzzz
      2 days ago


















    BOUNDED: What do you mean "evalutate its arguments more than once"?
    – markzzz
    2 days ago




    BOUNDED: What do you mean "evalutate its arguments more than once"?
    – markzzz
    2 days ago












    BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
    – Toby Speight
    2 days ago






    BOUNDED expands x three times. That's misleading if x is an expression with side-effects (e.g. *p++) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
    – Toby Speight
    2 days ago














    I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
    – markzzz
    2 days ago




    I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
    – markzzz
    2 days ago












    The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
    – markzzz
    2 days ago






    The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
    – markzzz
    2 days ago












    up vote
    1
    down vote













    I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.



    $$12 log_2 frac {130.81278} {55} approx 15$$



    $$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$



    $$f approx 130.812,782,650,299...$$



    In C,



    const double noteFrequency = 110*pow(2, 0.25);





    share|improve this answer



























      up vote
      1
      down vote













      I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.



      $$12 log_2 frac {130.81278} {55} approx 15$$



      $$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$



      $$f approx 130.812,782,650,299...$$



      In C,



      const double noteFrequency = 110*pow(2, 0.25);





      share|improve this answer

























        up vote
        1
        down vote










        up vote
        1
        down vote









        I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.



        $$12 log_2 frac {130.81278} {55} approx 15$$



        $$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$



        $$f approx 130.812,782,650,299...$$



        In C,



        const double noteFrequency = 110*pow(2, 0.25);





        share|improve this answer














        I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.



        $$12 log_2 frac {130.81278} {55} approx 15$$



        $$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$



        $$f approx 130.812,782,650,299...$$



        In C,



        const double noteFrequency = 110*pow(2, 0.25);






        share|improve this answer














        share|improve this answer



        share|improve this answer








        edited 2 days ago

























        answered 2 days ago









        Reinderien

        1,272516




        1,272516






























             

            draft saved


            draft discarded



















































             


            draft saved


            draft discarded














            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f207967%2fmodulating-an-oscillator-by-gain-pitch-and-offset%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown





















































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown

































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown







            Popular posts from this blog

            Quarter-circle Tiles

            build a pushdown automaton that recognizes the reverse language of a given pushdown automaton?

            Mont Emei