Modulating an oscillator by gain, pitch, and offset
up vote
1
down vote
favorite
I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.
It processes an Oscillator
, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.
Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot)
optimized settings):
#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>
const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;
#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;
Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}
inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }
inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];
int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}
private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];
Param *pGain, *pOffset, *pPitch;
Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);
// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}
void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();
// precomputed data
double bp0 = noteFrequency * hostPitch;
// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}
// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};
class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;
MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];
// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}
void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));
// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate
// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);
// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};
int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];
MyPlugin myPlugin;
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
// do somethings with buffer
}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}
Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.
I hope I can do it better. Any tips/suggestions? Vectorizing?
Note: I could use std::clamp
and C++17, but it doesn't matter a lot here: nothing really change with that expedient.
c++ performance audio signal-processing
add a comment |
up vote
1
down vote
favorite
I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.
It processes an Oscillator
, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.
Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot)
optimized settings):
#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>
const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;
#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;
Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}
inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }
inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];
int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}
private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];
Param *pGain, *pOffset, *pPitch;
Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);
// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}
void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();
// precomputed data
double bp0 = noteFrequency * hostPitch;
// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}
// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};
class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;
MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];
// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}
void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));
// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate
// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);
// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};
int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];
MyPlugin myPlugin;
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
// do somethings with buffer
}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}
Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.
I hope I can do it better. Any tips/suggestions? Vectorizing?
Note: I could use std::clamp
and C++17, but it doesn't matter a lot here: nothing really change with that expedient.
c++ performance audio signal-processing
1
I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago
add a comment |
up vote
1
down vote
favorite
up vote
1
down vote
favorite
I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.
It processes an Oscillator
, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.
Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot)
optimized settings):
#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>
const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;
#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;
Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}
inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }
inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];
int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}
private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];
Param *pGain, *pOffset, *pPitch;
Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);
// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}
void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();
// precomputed data
double bp0 = noteFrequency * hostPitch;
// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}
// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};
class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;
MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];
// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}
void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));
// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate
// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);
// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};
int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];
MyPlugin myPlugin;
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
// do somethings with buffer
}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}
Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.
I hope I can do it better. Any tips/suggestions? Vectorizing?
Note: I could use std::clamp
and C++17, but it doesn't matter a lot here: nothing really change with that expedient.
c++ performance audio signal-processing
I've an Audio Host application that sends block of samples to a DLL I'm building. It sends 44100 samples per seconds (i.e. Sample Rate 44100hz), per block, processing 16 distinct Voices.
It processes an Oscillator
, which plays a basic sine wave, modulated at Audio Rate by Gain, Pitch and Offset.
Here's the C++ code I have (I'm on MSVC, Release/x86 Configuration with /02 (/Ot)
optimized settings):
#include <iostream>
#include <cstring>
#include <cmath>
#include <chrono>
const int voiceSize = 16;
const int bufferSize = 256;
const double pi = 3.141592653589793238;
const double twopi = 2 * pi;
double sampleRate = 44100.0;
double noteFrequency = 130.81278;
double hostPitch = 1.0;
#define BOUNDED(x,lo,hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
class Param
{
public:
int mControlRate = 1;
double mValue = 0.5;
double mProcessedValues[voiceSize][bufferSize];
double *pModValues;
Param(double min, double max) {
mMin = min;
mMax = max;
mRange = max - min;
}
inline double GetMin() { return mMin; }
inline double GetRange() { return mRange; }
inline double GetProcessedVoiceValue(int voiceIndex, int sampleIndex) { return mProcessedVoicesValues[voiceIndex][sampleIndex]; }
inline void AddModulation(int voiceIndex, int blockSize) {
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
double *pProcessedValues = mProcessedValues[voiceIndex];
int i = 0;
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex += mControlRate, i++) {
pProcessedValues[i] = BOUNDED(mValue + pModVoiceValues[sampleIndex], 0.0, 1.0);
}
}
private:
double mMin, mMax, mRange;
double mProcessedVoicesValues[voiceSize][bufferSize];
};
class Oscillator
{
public:
double mRadiansPerSample = twopi / sampleRate;
double ln2per12 = std::log(2.0) / 12.0;
double mPhase[voiceSize];
Param *pGain, *pOffset, *pPitch;
Oscillator() {
pGain = new Param(0.0, 1.0);
pOffset = new Param(-900.0, 900.0);
pPitch = new Param(-48.0, 48.0);
// reset osc phase (start at 0.0)
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
Reset(voiceIndex);
}
}
~Oscillator() {
delete pGain;
delete pOffset;
delete pPitch;
}
void Reset(int voiceIndex) {
mPhase[voiceIndex] = 0.0;
}
void ProcessVoiceBlock(int voiceIndex, int blockSize, double noteFrequency, double *left, double *right) {
// local copy
double phase = mPhase[voiceIndex];
double offsetMin = pOffset->GetMin();
double offsetRange = pOffset->GetRange();
double pitchMin = pPitch->GetMin();
double pitchRange = pPitch->GetRange();
// precomputed data
double bp0 = noteFrequency * hostPitch;
// process block values
for (int sampleIndex = 0; sampleIndex < blockSize; sampleIndex++) {
double value = (sin(phase)) * pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(mRadiansPerSample * (bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex, sampleIndex), pitchMin, pitchRange) + GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex, sampleIndex), offsetMin, offsetRange)), 0, pi);
while (phase >= twopi) { phase -= twopi; }
}
// revert local copy
mPhase[voiceIndex] = phase;
}
inline double GetOffsetWarped(double normalizedValue, double min, double range) { return min + normalizedValue * range; }
inline double GetPitchWarped(double normalizedValue, double min, double range) { return exp((min + normalizedValue * range) * ln2per12); }
};
class MyPlugin
{
public:
double gainModValues[voiceSize][bufferSize];
double offsetModValues[voiceSize][bufferSize];
double pitchModValues[voiceSize][bufferSize];
Oscillator oscillator;
MyPlugin() {
// link mod arrays to params
oscillator.pGain->pModValues = gainModValues[0];
oscillator.pOffset->pModValues = offsetModValues[0];
oscillator.pPitch->pModValues = pitchModValues[0];
// some fancy data for mod
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
for (int sampleIndex = 0; sampleIndex < bufferSize; sampleIndex++) {
gainModValues[voiceIndex][sampleIndex] = sampleIndex / (double)bufferSize;
}
}
}
void ProcessDoubleReplace(int blockSize, double *bufferLeft, double *bufferRight) {
// init buffer
memset(bufferLeft, 0, blockSize * sizeof(double));
memset(bufferRight, 0, blockSize * sizeof(double));
// voices
for (int voiceIndex = 0; voiceIndex < voiceSize; voiceIndex++) {
// envelopes - here's where mod values will change, at audio rate
// add mod to params
oscillator.pGain->AddModulation(voiceIndex, blockSize);
oscillator.pOffset->AddModulation(voiceIndex, blockSize);
oscillator.pPitch->AddModulation(voiceIndex, blockSize);
// osc buffer
oscillator.ProcessVoiceBlock(voiceIndex, blockSize, noteFrequency, bufferLeft, bufferRight);
}
}
};
int main(int argc, const char *argv) {
double bufferLeft[bufferSize];
double bufferRight[bufferSize];
MyPlugin myPlugin;
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
// do somethings with buffer
}
std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
}
Considering I'm running 16 Voices contemporaneously, it takes an huge amount of CPU for a simple Gain/Pitch/Offset modulation.
I hope I can do it better. Any tips/suggestions? Vectorizing?
Note: I could use std::clamp
and C++17, but it doesn't matter a lot here: nothing really change with that expedient.
c++ performance audio signal-processing
c++ performance audio signal-processing
edited 2 days ago
asked 2 days ago
markzzz
1285
1285
1
I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago
add a comment |
1
I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago
1
1
I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago
I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago
add a comment |
4 Answers
4
active
oldest
votes
up vote
5
down vote
- Don't define pi and twopi yourself. They should be available from math.h as
M_PI
andM_2_PI
if you're in the GNU stack. - Defining functions as
inline
is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.
GetMin()
should beGetMin() const
. Same forGetRange
,GetProcessedVoiceValue
, and anything that doesn't modify a class member.
while (phase >= twopi) { phase -= twopi; }
- This should not be a loop. It can be done in O(1), something likephase = fmod(phase, twopi)
- but double-check this. For very small values ofphase
,fmod
may actually be slower. You can also tryphase -= twopi*int(phase/twopi)
which is also O(1).- On this line:
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:
double *pModVoiceValues = pModValues + voiceIndex*bufferSize;
M_PI
andM_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values.const auto pi = 4.0 * std::atan(1)
, for example.
– Toby Speight
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@markzzz Thefmod
will improve program speed, if only imperceptibly.
– Reinderien
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
|
show 5 more comments
up vote
5
down vote
counting flops
Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.
double value = sin(phase) *
pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(
mRadiansPerSample * (
bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
sampleIndex), pitchMin, pitchRange)
+ GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
sampleIndex), offsetMin, offsetRange)
),
0, pi
);
while (phase >= twopi) { phase -= twopi; }
// . . .
inline double GetOffsetWarped(double normalizedValue, double min, double range)
{
return min + normalizedValue * range;
}
inline double GetPitchWarped(double normalizedValue, double min, double range)
{
return exp((min + normalizedValue * range) * ln2per12);
}
Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12)
, where omega_C = TAU*440*2**(-2 + 3/12)
is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:
# code sample 0
# Rescale user inputs.
pitches = pitch_min + pitches_raw*pitch_range
omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
gains = gain_min + gains_raw*gain_range
omegas = omega_C * 2**(pitches/12) + omega_offsets
values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)
That is basically the entire program. First, the constants.
dt
is the time in seconds between samples. dt = 1/sample_rate
, sample_rate = 44100
. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.
phase0
is the oscillator's phase at the beginning of the block you are currently processing.
gains
, pitches
, and omega_offsets
are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains
ranges from 0 to 1, pitches
ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets
ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.
You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.
Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.
In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.
small loops
Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.
vectorising
You're doing the same operations blockSize
times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.
#include <mkl.h>
// determines the size of a static array.
#define SASIZE(sa) (sizeof(sa)/sizeof(*sa))
// . . .
float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
float sin_phases[SASIZE(phases)];
vsSin(SASIZE(phases), phases, sin_phases);
// Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
// all at once, about 4 or 8 at a time using simd instructions.
float
This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.
integral phase
You can store and manipulate phase using uint32_t
s. It is naturally modular with modulus 2**32, so you never need to do any fmod
s or subtractions, especially repeated subtractions in a loop.
while (phase >= twopi) { phase -= twopi; } // idk about this...
That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.
#include <cstdio> // Hey, I like *printf().
#include <cinttypes>
// These are actually templated C++ math functions. The c doesn't
// stand for C, I guess...
#include <cmath>
using namespace std;
#define TAU 6.283185307179586
// fmodf, but always rounding toward negative infinity.
// The output will always be in [0, y). This is not the case
// with fmod(). See man fmod.
float fmodf_rd(float x, float y)
{
float r = fmod(x, y);
return r < 0.f? r + y : r;
}
uint32_t integral_phase(float phi)
{
return phi / (float)TAU * 0x1p32f;
}
float float_phase(uint32_t phi_i)
{
return (float)phi_i * (float)TAU * 0x1p-32f;
}
int main(int argc, const char *argv)
{
// Phases, in radians.
float alpha = TAU/12., beta = atan(2.f);
uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);
// Phase calculations in radians and floating point:
float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
// The same phase calculation in integral phase. Note there is no
// mod operation at all.
uint32_t gamma_i = 5*alpha_i - 7*beta_i;
printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
return 0;
}
Epilogue: An M. Night Shyamalan twist.
This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono
badness and replaced it with dtime(),
a wrapper I wrote around clock_gettime()
. dtime()
returns the current value of CLOCK_MONOTONIC
in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:
--- mod.cpp- 2018-11-20 03:19:11.091296221 -0500
+++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
@@ -1,7 +1,21 @@
-#include <iostream>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
#include <cstring>
#include <cmath>
-#include <chrono>
+
+double timespec_to_d(struct timespec *ts)
+{
+ return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
+}
+double dtime(void)
+{
+ struct timespec ts;
+ int res = clock_gettime(CLOCK_MONOTONIC, &ts);
+ assert(res == 0);
+ return timespec_to_d(&ts);
+}
const int voiceSize = 16;
const int bufferSize = 256;
@@ -154,7 +168,7 @@
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
- std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
+ double t0 = dtime();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
@@ -162,6 +176,8 @@
// do somethings with buffer
}
- std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
- std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
+ double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
+ dt_total = 1./sampleRate;
+ printf("CPU = %.6g %%n", dt_busy/dt_total);
+ return 0;
}
dt_busy
is the total amount of time your code takes to process a single sample. dt_total
, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.
aaaaaaand this is the output:
% g++ -o mod mod.cpp
% ./mod
CPU = 0.105791 %
%
When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.
New contributor
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
|
show 3 more comments
up vote
4
down vote
General overview
I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.
The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue()
doesn't indicate what kind of processing was done; ln2per12
only captures how it was computed, rather than what it's for, and MyPlugin
looks like a generic temporary name.
Specific issues
BOUNDED()
is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp()
or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.
Variable-length arrays are not standard C++. Fix this by making voiceSize
and bufferSize
compile-time constants, aka constexpr
(rather than plain const
).
Prefer initialisation to assignment. In Param
, for example, we could make mMin
, mMax
and mRange
const if we initialize:
Param(double min, double max)
: mMin{min},
// mMax was never used, so don't store it
mRange{max - min}
{
}
// make them public, so we don't need `get_min()` etc.
const double mMin, mRange;
The uninitialised pModValues
pointer is especially dangerous here.
I don't understand why we're updating left
and right
identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy()
to duplicate it afterwards.
In Oscillator
, there's no need for pGain
, pOffset
and pPitch
to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc
) by making them simple member variables - don't write Java code!
Param pGain, pOffset, pPitch;
Oscillator()
: pGain{0.0, 1.0},
pOffset{-900.0, 900.0},
pPitch{-48.0, 48.0}
{ /* ... */ }
// destructor is now defaulted
Some functions from <cmath>
wrongly accessed in the global namespace (namely std::sin()
and std::exp()
); also std::memset()
from <cstring>
. These should be correctly qualified for portable code.
Prefer std::fill()
or std::fill_n()
over std::memset()
- the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:
std::fill_n(bufferLeft, blockSize, 0);
std::fill_n(bufferRight, blockSize, 0);
Declare main()
with no arguments when not intending to process command-line options.
If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native
to my compilation command).
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED
expandsx
three times. That's misleading ifx
is an expression with side-effects (e.g.*p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
– Toby Speight
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
add a comment |
up vote
1
down vote
I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.
$$12 log_2 frac {130.81278} {55} approx 15$$
$$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$
$$f approx 130.812,782,650,299...$$
In C,
const double noteFrequency = 110*pow(2, 0.25);
add a comment |
4 Answers
4
active
oldest
votes
4 Answers
4
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
5
down vote
- Don't define pi and twopi yourself. They should be available from math.h as
M_PI
andM_2_PI
if you're in the GNU stack. - Defining functions as
inline
is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.
GetMin()
should beGetMin() const
. Same forGetRange
,GetProcessedVoiceValue
, and anything that doesn't modify a class member.
while (phase >= twopi) { phase -= twopi; }
- This should not be a loop. It can be done in O(1), something likephase = fmod(phase, twopi)
- but double-check this. For very small values ofphase
,fmod
may actually be slower. You can also tryphase -= twopi*int(phase/twopi)
which is also O(1).- On this line:
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:
double *pModVoiceValues = pModValues + voiceIndex*bufferSize;
M_PI
andM_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values.const auto pi = 4.0 * std::atan(1)
, for example.
– Toby Speight
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@markzzz Thefmod
will improve program speed, if only imperceptibly.
– Reinderien
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
|
show 5 more comments
up vote
5
down vote
- Don't define pi and twopi yourself. They should be available from math.h as
M_PI
andM_2_PI
if you're in the GNU stack. - Defining functions as
inline
is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.
GetMin()
should beGetMin() const
. Same forGetRange
,GetProcessedVoiceValue
, and anything that doesn't modify a class member.
while (phase >= twopi) { phase -= twopi; }
- This should not be a loop. It can be done in O(1), something likephase = fmod(phase, twopi)
- but double-check this. For very small values ofphase
,fmod
may actually be slower. You can also tryphase -= twopi*int(phase/twopi)
which is also O(1).- On this line:
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:
double *pModVoiceValues = pModValues + voiceIndex*bufferSize;
M_PI
andM_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values.const auto pi = 4.0 * std::atan(1)
, for example.
– Toby Speight
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@markzzz Thefmod
will improve program speed, if only imperceptibly.
– Reinderien
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
|
show 5 more comments
up vote
5
down vote
up vote
5
down vote
- Don't define pi and twopi yourself. They should be available from math.h as
M_PI
andM_2_PI
if you're in the GNU stack. - Defining functions as
inline
is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.
GetMin()
should beGetMin() const
. Same forGetRange
,GetProcessedVoiceValue
, and anything that doesn't modify a class member.
while (phase >= twopi) { phase -= twopi; }
- This should not be a loop. It can be done in O(1), something likephase = fmod(phase, twopi)
- but double-check this. For very small values ofphase
,fmod
may actually be slower. You can also tryphase -= twopi*int(phase/twopi)
which is also O(1).- On this line:
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:
double *pModVoiceValues = pModValues + voiceIndex*bufferSize;
- Don't define pi and twopi yourself. They should be available from math.h as
M_PI
andM_2_PI
if you're in the GNU stack. - Defining functions as
inline
is more or less useless. The compiler will do this (or not) as it sees fit, and generally knows better than you on when it's beneficial.
GetMin()
should beGetMin() const
. Same forGetRange
,GetProcessedVoiceValue
, and anything that doesn't modify a class member.
while (phase >= twopi) { phase -= twopi; }
- This should not be a loop. It can be done in O(1), something likephase = fmod(phase, twopi)
- but double-check this. For very small values ofphase
,fmod
may actually be slower. You can also tryphase -= twopi*int(phase/twopi)
which is also O(1).- On this line:
double *pModVoiceValues = &pModValues[voiceIndex * bufferSize];
You're dereferencing and then re-referencing a pointer, which you don't need to do. Simply add:
double *pModVoiceValues = pModValues + voiceIndex*bufferSize;
edited 2 days ago
answered 2 days ago
Reinderien
1,272516
1,272516
M_PI
andM_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values.const auto pi = 4.0 * std::atan(1)
, for example.
– Toby Speight
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@markzzz Thefmod
will improve program speed, if only imperceptibly.
– Reinderien
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
|
show 5 more comments
M_PI
andM_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values.const auto pi = 4.0 * std::atan(1)
, for example.
– Toby Speight
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@markzzz Thefmod
will improve program speed, if only imperceptibly.
– Reinderien
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
M_PI
and M_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1)
, for example.– Toby Speight
2 days ago
M_PI
and M_2_PI
are not standard C++ identifiers. That said, your reasoning is correct; we can compute suitable values. const auto pi = 4.0 * std::atan(1)
, for example.– Toby Speight
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@TobySpeight Thanks; I guess those are Gnu-specific.
– Reinderien
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@Reinderien; nice suggestions. But unfortunately they don't improve the speed of the program.
– markzzz
2 days ago
@markzzz The
fmod
will improve program speed, if only imperceptibly.– Reinderien
2 days ago
@markzzz The
fmod
will improve program speed, if only imperceptibly.– Reinderien
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
curious: fmod slow down the process here :O
– markzzz
2 days ago
|
show 5 more comments
up vote
5
down vote
counting flops
Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.
double value = sin(phase) *
pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(
mRadiansPerSample * (
bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
sampleIndex), pitchMin, pitchRange)
+ GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
sampleIndex), offsetMin, offsetRange)
),
0, pi
);
while (phase >= twopi) { phase -= twopi; }
// . . .
inline double GetOffsetWarped(double normalizedValue, double min, double range)
{
return min + normalizedValue * range;
}
inline double GetPitchWarped(double normalizedValue, double min, double range)
{
return exp((min + normalizedValue * range) * ln2per12);
}
Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12)
, where omega_C = TAU*440*2**(-2 + 3/12)
is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:
# code sample 0
# Rescale user inputs.
pitches = pitch_min + pitches_raw*pitch_range
omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
gains = gain_min + gains_raw*gain_range
omegas = omega_C * 2**(pitches/12) + omega_offsets
values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)
That is basically the entire program. First, the constants.
dt
is the time in seconds between samples. dt = 1/sample_rate
, sample_rate = 44100
. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.
phase0
is the oscillator's phase at the beginning of the block you are currently processing.
gains
, pitches
, and omega_offsets
are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains
ranges from 0 to 1, pitches
ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets
ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.
You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.
Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.
In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.
small loops
Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.
vectorising
You're doing the same operations blockSize
times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.
#include <mkl.h>
// determines the size of a static array.
#define SASIZE(sa) (sizeof(sa)/sizeof(*sa))
// . . .
float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
float sin_phases[SASIZE(phases)];
vsSin(SASIZE(phases), phases, sin_phases);
// Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
// all at once, about 4 or 8 at a time using simd instructions.
float
This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.
integral phase
You can store and manipulate phase using uint32_t
s. It is naturally modular with modulus 2**32, so you never need to do any fmod
s or subtractions, especially repeated subtractions in a loop.
while (phase >= twopi) { phase -= twopi; } // idk about this...
That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.
#include <cstdio> // Hey, I like *printf().
#include <cinttypes>
// These are actually templated C++ math functions. The c doesn't
// stand for C, I guess...
#include <cmath>
using namespace std;
#define TAU 6.283185307179586
// fmodf, but always rounding toward negative infinity.
// The output will always be in [0, y). This is not the case
// with fmod(). See man fmod.
float fmodf_rd(float x, float y)
{
float r = fmod(x, y);
return r < 0.f? r + y : r;
}
uint32_t integral_phase(float phi)
{
return phi / (float)TAU * 0x1p32f;
}
float float_phase(uint32_t phi_i)
{
return (float)phi_i * (float)TAU * 0x1p-32f;
}
int main(int argc, const char *argv)
{
// Phases, in radians.
float alpha = TAU/12., beta = atan(2.f);
uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);
// Phase calculations in radians and floating point:
float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
// The same phase calculation in integral phase. Note there is no
// mod operation at all.
uint32_t gamma_i = 5*alpha_i - 7*beta_i;
printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
return 0;
}
Epilogue: An M. Night Shyamalan twist.
This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono
badness and replaced it with dtime(),
a wrapper I wrote around clock_gettime()
. dtime()
returns the current value of CLOCK_MONOTONIC
in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:
--- mod.cpp- 2018-11-20 03:19:11.091296221 -0500
+++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
@@ -1,7 +1,21 @@
-#include <iostream>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
#include <cstring>
#include <cmath>
-#include <chrono>
+
+double timespec_to_d(struct timespec *ts)
+{
+ return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
+}
+double dtime(void)
+{
+ struct timespec ts;
+ int res = clock_gettime(CLOCK_MONOTONIC, &ts);
+ assert(res == 0);
+ return timespec_to_d(&ts);
+}
const int voiceSize = 16;
const int bufferSize = 256;
@@ -154,7 +168,7 @@
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
- std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
+ double t0 = dtime();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
@@ -162,6 +176,8 @@
// do somethings with buffer
}
- std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
- std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
+ double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
+ dt_total = 1./sampleRate;
+ printf("CPU = %.6g %%n", dt_busy/dt_total);
+ return 0;
}
dt_busy
is the total amount of time your code takes to process a single sample. dt_total
, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.
aaaaaaand this is the output:
% g++ -o mod mod.cpp
% ./mod
CPU = 0.105791 %
%
When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.
New contributor
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
|
show 3 more comments
up vote
5
down vote
counting flops
Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.
double value = sin(phase) *
pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(
mRadiansPerSample * (
bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
sampleIndex), pitchMin, pitchRange)
+ GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
sampleIndex), offsetMin, offsetRange)
),
0, pi
);
while (phase >= twopi) { phase -= twopi; }
// . . .
inline double GetOffsetWarped(double normalizedValue, double min, double range)
{
return min + normalizedValue * range;
}
inline double GetPitchWarped(double normalizedValue, double min, double range)
{
return exp((min + normalizedValue * range) * ln2per12);
}
Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12)
, where omega_C = TAU*440*2**(-2 + 3/12)
is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:
# code sample 0
# Rescale user inputs.
pitches = pitch_min + pitches_raw*pitch_range
omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
gains = gain_min + gains_raw*gain_range
omegas = omega_C * 2**(pitches/12) + omega_offsets
values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)
That is basically the entire program. First, the constants.
dt
is the time in seconds between samples. dt = 1/sample_rate
, sample_rate = 44100
. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.
phase0
is the oscillator's phase at the beginning of the block you are currently processing.
gains
, pitches
, and omega_offsets
are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains
ranges from 0 to 1, pitches
ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets
ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.
You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.
Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.
In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.
small loops
Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.
vectorising
You're doing the same operations blockSize
times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.
#include <mkl.h>
// determines the size of a static array.
#define SASIZE(sa) (sizeof(sa)/sizeof(*sa))
// . . .
float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
float sin_phases[SASIZE(phases)];
vsSin(SASIZE(phases), phases, sin_phases);
// Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
// all at once, about 4 or 8 at a time using simd instructions.
float
This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.
integral phase
You can store and manipulate phase using uint32_t
s. It is naturally modular with modulus 2**32, so you never need to do any fmod
s or subtractions, especially repeated subtractions in a loop.
while (phase >= twopi) { phase -= twopi; } // idk about this...
That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.
#include <cstdio> // Hey, I like *printf().
#include <cinttypes>
// These are actually templated C++ math functions. The c doesn't
// stand for C, I guess...
#include <cmath>
using namespace std;
#define TAU 6.283185307179586
// fmodf, but always rounding toward negative infinity.
// The output will always be in [0, y). This is not the case
// with fmod(). See man fmod.
float fmodf_rd(float x, float y)
{
float r = fmod(x, y);
return r < 0.f? r + y : r;
}
uint32_t integral_phase(float phi)
{
return phi / (float)TAU * 0x1p32f;
}
float float_phase(uint32_t phi_i)
{
return (float)phi_i * (float)TAU * 0x1p-32f;
}
int main(int argc, const char *argv)
{
// Phases, in radians.
float alpha = TAU/12., beta = atan(2.f);
uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);
// Phase calculations in radians and floating point:
float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
// The same phase calculation in integral phase. Note there is no
// mod operation at all.
uint32_t gamma_i = 5*alpha_i - 7*beta_i;
printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
return 0;
}
Epilogue: An M. Night Shyamalan twist.
This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono
badness and replaced it with dtime(),
a wrapper I wrote around clock_gettime()
. dtime()
returns the current value of CLOCK_MONOTONIC
in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:
--- mod.cpp- 2018-11-20 03:19:11.091296221 -0500
+++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
@@ -1,7 +1,21 @@
-#include <iostream>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
#include <cstring>
#include <cmath>
-#include <chrono>
+
+double timespec_to_d(struct timespec *ts)
+{
+ return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
+}
+double dtime(void)
+{
+ struct timespec ts;
+ int res = clock_gettime(CLOCK_MONOTONIC, &ts);
+ assert(res == 0);
+ return timespec_to_d(&ts);
+}
const int voiceSize = 16;
const int bufferSize = 256;
@@ -154,7 +168,7 @@
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
- std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
+ double t0 = dtime();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
@@ -162,6 +176,8 @@
// do somethings with buffer
}
- std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
- std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
+ double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
+ dt_total = 1./sampleRate;
+ printf("CPU = %.6g %%n", dt_busy/dt_total);
+ return 0;
}
dt_busy
is the total amount of time your code takes to process a single sample. dt_total
, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.
aaaaaaand this is the output:
% g++ -o mod mod.cpp
% ./mod
CPU = 0.105791 %
%
When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.
New contributor
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
|
show 3 more comments
up vote
5
down vote
up vote
5
down vote
counting flops
Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.
double value = sin(phase) *
pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(
mRadiansPerSample * (
bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
sampleIndex), pitchMin, pitchRange)
+ GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
sampleIndex), offsetMin, offsetRange)
),
0, pi
);
while (phase >= twopi) { phase -= twopi; }
// . . .
inline double GetOffsetWarped(double normalizedValue, double min, double range)
{
return min + normalizedValue * range;
}
inline double GetPitchWarped(double normalizedValue, double min, double range)
{
return exp((min + normalizedValue * range) * ln2per12);
}
Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12)
, where omega_C = TAU*440*2**(-2 + 3/12)
is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:
# code sample 0
# Rescale user inputs.
pitches = pitch_min + pitches_raw*pitch_range
omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
gains = gain_min + gains_raw*gain_range
omegas = omega_C * 2**(pitches/12) + omega_offsets
values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)
That is basically the entire program. First, the constants.
dt
is the time in seconds between samples. dt = 1/sample_rate
, sample_rate = 44100
. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.
phase0
is the oscillator's phase at the beginning of the block you are currently processing.
gains
, pitches
, and omega_offsets
are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains
ranges from 0 to 1, pitches
ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets
ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.
You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.
Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.
In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.
small loops
Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.
vectorising
You're doing the same operations blockSize
times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.
#include <mkl.h>
// determines the size of a static array.
#define SASIZE(sa) (sizeof(sa)/sizeof(*sa))
// . . .
float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
float sin_phases[SASIZE(phases)];
vsSin(SASIZE(phases), phases, sin_phases);
// Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
// all at once, about 4 or 8 at a time using simd instructions.
float
This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.
integral phase
You can store and manipulate phase using uint32_t
s. It is naturally modular with modulus 2**32, so you never need to do any fmod
s or subtractions, especially repeated subtractions in a loop.
while (phase >= twopi) { phase -= twopi; } // idk about this...
That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.
#include <cstdio> // Hey, I like *printf().
#include <cinttypes>
// These are actually templated C++ math functions. The c doesn't
// stand for C, I guess...
#include <cmath>
using namespace std;
#define TAU 6.283185307179586
// fmodf, but always rounding toward negative infinity.
// The output will always be in [0, y). This is not the case
// with fmod(). See man fmod.
float fmodf_rd(float x, float y)
{
float r = fmod(x, y);
return r < 0.f? r + y : r;
}
uint32_t integral_phase(float phi)
{
return phi / (float)TAU * 0x1p32f;
}
float float_phase(uint32_t phi_i)
{
return (float)phi_i * (float)TAU * 0x1p-32f;
}
int main(int argc, const char *argv)
{
// Phases, in radians.
float alpha = TAU/12., beta = atan(2.f);
uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);
// Phase calculations in radians and floating point:
float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
// The same phase calculation in integral phase. Note there is no
// mod operation at all.
uint32_t gamma_i = 5*alpha_i - 7*beta_i;
printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
return 0;
}
Epilogue: An M. Night Shyamalan twist.
This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono
badness and replaced it with dtime(),
a wrapper I wrote around clock_gettime()
. dtime()
returns the current value of CLOCK_MONOTONIC
in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:
--- mod.cpp- 2018-11-20 03:19:11.091296221 -0500
+++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
@@ -1,7 +1,21 @@
-#include <iostream>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
#include <cstring>
#include <cmath>
-#include <chrono>
+
+double timespec_to_d(struct timespec *ts)
+{
+ return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
+}
+double dtime(void)
+{
+ struct timespec ts;
+ int res = clock_gettime(CLOCK_MONOTONIC, &ts);
+ assert(res == 0);
+ return timespec_to_d(&ts);
+}
const int voiceSize = 16;
const int bufferSize = 256;
@@ -154,7 +168,7 @@
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
- std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
+ double t0 = dtime();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
@@ -162,6 +176,8 @@
// do somethings with buffer
}
- std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
- std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
+ double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
+ dt_total = 1./sampleRate;
+ printf("CPU = %.6g %%n", dt_busy/dt_total);
+ return 0;
}
dt_busy
is the total amount of time your code takes to process a single sample. dt_total
, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.
aaaaaaand this is the output:
% g++ -o mod mod.cpp
% ./mod
CPU = 0.105791 %
%
When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.
New contributor
counting flops
Let's unpack the inner loop, where all the work is being done. I am repeating the whole thing here, wrapping to 80 columns and adding some indentation to make it easier to read.
double value = sin(phase) *
pGain->GetProcessedVoiceValue(voiceIndex, sampleIndex);
*left++ += value;
*right++ += value;
// next phase
phase += BOUNDED(
mRadiansPerSample * (
bp0 * GetPitchWarped(pPitch->GetProcessedVoiceValue(voiceIndex,
sampleIndex), pitchMin, pitchRange)
+ GetOffsetWarped(pOffset->GetProcessedVoiceValue(voiceIndex,
sampleIndex), offsetMin, offsetRange)
),
0, pi
);
while (phase >= twopi) { phase -= twopi; }
// . . .
inline double GetOffsetWarped(double normalizedValue, double min, double range)
{
return min + normalizedValue * range;
}
inline double GetPitchWarped(double normalizedValue, double min, double range)
{
return exp((min + normalizedValue * range) * ln2per12);
}
Let's see if I've got this straight. This is your program, as I understand it. You have 16 independent oscillators, each with their own time-varying frequency and gain. The frequency is a function of a pitch and an offset frequency. For now, I'll define pitch as separate from frequency (omega) in more or less the same manner you have done, switching to python for brevity: omega = omega_C * 2**(pitch/12)
, where omega_C = TAU*440*2**(-2 + 3/12)
is the frequency of the C below middle C. So pitch is in semitones and frequency is in rad/s. In numpy terms, you are doing the following for each oscillator:
# code sample 0
# Rescale user inputs.
pitches = pitch_min + pitches_raw*pitch_range
omega_offsets = omega_offset_min + omega_offsets_raw*omega_offset_range
gains = gain_min + gains_raw*gain_range
omegas = omega_C * 2**(pitches/12) + omega_offsets
values = gains*np.sin(phase0 + np.cumsum(omegas)*dt)
That is basically the entire program. First, the constants.
dt
is the time in seconds between samples. dt = 1/sample_rate
, sample_rate = 44100
. I have written the cumulative sum using this dt to illustrate the direct correspondence to the integral over a time-varying omega that the sum is approximating.
phase0
is the oscillator's phase at the beginning of the block you are currently processing.
gains
, pitches
, and omega_offsets
are three floating point user inputs, gains and pitches possibly being mapped to an after touch MIDI keyboard with a pitch bend and omega_offsets mapped to a sweet MIDI slider, all being sampled at the audio sample rate. They are being represented as numpy vectors, hence the esses. Gains
ranges from 0 to 1, pitches
ranges from 4 octaves below omega_C to 4 octaves above omega_C, and omega_offsets
ranges from -900Hz to 900Hz. (Incidentally, your gain min and max are not being used!) That offset frequency is super weird and kinda cool. That makes no sense from a conventional music theory standpoint. With any offset at all, pitch will no longer match up with octaves. I'd actually like to hear what that sounds like.
You could have made it easier for the reader to figure all that out. If I didn't already have some idea what to look for, I would have been completely lost. You should find a way to get your code to look as much like code sample 0 as possible.
Anyway, how fast should this loop run? How many flops does this loop translate to? Counting the number of operations per sample and using code sample 0 as a guide, I count 5 +s, 7 *s, an exp and a sin. Adding another + for when the 16 voices are added, that's 6 +s. Using this benchmark as a rough guideline, that translates to 6 + 7 + 10 + 15 = 38 flops per sample per voice. Multiplying by 16 voices and 44100 samples per second, that's 27 megaflops. A 1 GHz processor has a thousand cycles to do just 27 of those calculations. You should be seeing almost zero CPU on your toaster. There's something else going on here.
In fact, there's nothing wrong with your code. It was your benchmark. See the epilogue. But I left all these suggestions here anyway if for some reason you want your code to go even faster.
small loops
Your code might be slower than it could be because your inner loop is too big. Loops run fastest when they are as small as possible. This is because the fastest memory your system has—registers—is in extremely limited supply. If your loop is too big, the system runs out of registers and it has to use slower memory to run the loop. You can break up that inner loop into smaller loops—one per line in code sample 0. In theory, that should make your code run faster. But the compiler is actually able to figure out most of that, since everything is contained in a single file. It is able to inline whatever it wants and reorder execution however it wants, because it knows what every function is actually doing. This idea is closely related to vectorisation.
vectorising
You're doing the same operations blockSize
times. You should be able to speed that up with simd. The Intel MKL has simd-accelerated vectorised sin(), etc. that you can use. Code sample 0 gives you a basic idea of what the vectorised code is going to look like. The MKL docs don't give an example, so I'll give one now. As a reminder, you really don't have to do this. See the epilogue.
#include <mkl.h>
// determines the size of a static array.
#define SASIZE(sa) (sizeof(sa)/sizeof(*sa))
// . . .
float phases = {1.f, 2.f, 3.f, 4.f, 5.f};
float sin_phases[SASIZE(phases)];
vsSin(SASIZE(phases), phases, sin_phases);
// Now, sin_phases[i] = sin(phases[i]). All of the sin()s were computed
// all at once, about 4 or 8 at a time using simd instructions.
float
This is all audio data, so you don't need doubles. Floats will work just fine, and they'll be way way faster, especially after you get simd working.
integral phase
You can store and manipulate phase using uint32_t
s. It is naturally modular with modulus 2**32, so you never need to do any fmod
s or subtractions, especially repeated subtractions in a loop.
while (phase >= twopi) { phase -= twopi; } // idk about this...
That there is some weirdness. The mod 2**32 happens automatically when converting from float, adding, subtracting, whatever, so it should be a lot faster and a lot more accurate. You should only convert back to float phase before a sin operation or something like that. Here's an example. I also included an example of how to do the same modulo operation in all floating point numbers.
#include <cstdio> // Hey, I like *printf().
#include <cinttypes>
// These are actually templated C++ math functions. The c doesn't
// stand for C, I guess...
#include <cmath>
using namespace std;
#define TAU 6.283185307179586
// fmodf, but always rounding toward negative infinity.
// The output will always be in [0, y). This is not the case
// with fmod(). See man fmod.
float fmodf_rd(float x, float y)
{
float r = fmod(x, y);
return r < 0.f? r + y : r;
}
uint32_t integral_phase(float phi)
{
return phi / (float)TAU * 0x1p32f;
}
float float_phase(uint32_t phi_i)
{
return (float)phi_i * (float)TAU * 0x1p-32f;
}
int main(int argc, const char *argv)
{
// Phases, in radians.
float alpha = TAU/12., beta = atan(2.f);
uint32_t alpha_i = integral_phase(alpha), beta_i = integral_phase(beta);
// Phase calculations in radians and floating point:
float gamma = fmodf_rd(5.f*alpha - 7.f*beta, (float)TAU);
// The same phase calculation in integral phase. Note there is no
// mod operation at all.
uint32_t gamma_i = 5*alpha_i - 7*beta_i;
printf("difference = %.9gn", (double)(float_phase(gamma_i) - gamma));
return 0;
}
Epilogue: An M. Night Shyamalan twist.
This has been bothering me enough that I finally ran your code. You were saying you were getting slow results, but your code, though a little funky, did not look particularly slow actually. So I assumed it was something wrong with your benchmark. I nuked the C++ chrono
badness and replaced it with dtime(),
a wrapper I wrote around clock_gettime()
. dtime()
returns the current value of CLOCK_MONOTONIC
in seconds as a double. I also redid the benchmark math, outputting the performance in terms of %CPU. This is the diff:
--- mod.cpp- 2018-11-20 03:19:11.091296221 -0500
+++ mod.cpp 2018-11-20 03:39:39.529689861 -0500
@@ -1,7 +1,21 @@
-#include <iostream>
+#include <stdio.h>
+#include <time.h>
+#include <assert.h>
+
#include <cstring>
#include <cmath>
-#include <chrono>
+
+double timespec_to_d(struct timespec *ts)
+{
+ return (double)ts->tv_sec + 1e-9*(double)ts->tv_nsec;
+}
+double dtime(void)
+{
+ struct timespec ts;
+ int res = clock_gettime(CLOCK_MONOTONIC, &ts);
+ assert(res == 0);
+ return timespec_to_d(&ts);
+}
const int voiceSize = 16;
const int bufferSize = 256;
@@ -154,7 +168,7 @@
// audio host call
int numProcessing = 1024 * 50;
int counterProcessing = 0;
- std::chrono::high_resolution_clock::time_point pStart = std::chrono::high_resolution_clock::now();
+ double t0 = dtime();
while (counterProcessing++ < numProcessing) {
int blockSize = 256;
myPlugin.ProcessDoubleReplace(blockSize, bufferLeft, bufferRight);
@@ -162,6 +176,8 @@
// do somethings with buffer
}
- std::chrono::high_resolution_clock::time_point pEnd = std::chrono::high_resolution_clock::now();
- std::cout << "execution time: " << std::chrono::duration_cast<std::chrono::milliseconds>(pEnd - pStart).count() << " ms" << std::endl;
+ double dt_busy = (dtime() - t0)/numProcessing/bufferSize,
+ dt_total = 1./sampleRate;
+ printf("CPU = %.6g %%n", dt_busy/dt_total);
+ return 0;
}
dt_busy
is the total amount of time your code takes to process a single sample. dt_total
, the time between samples, is the total amount of time your code has to process a single sample. Divide the two and you predict %CPU usage while your DLL is running.
aaaaaaand this is the output:
% g++ -o mod mod.cpp
% ./mod
CPU = 0.105791 %
%
When running as a plugin with realtime user input streams, your code will use .1% CPU. It was your benchmark the entire time.
New contributor
edited yesterday
New contributor
answered 2 days ago
enigmaticPhysicist
1513
1513
New contributor
New contributor
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
|
show 3 more comments
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Frequency offset, when set to 0, doesn't add offset, so pitch is perfectly tuned in octaves.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Can you give to me an example with simd sin and vectorization? I've no idea how to use it.
– markzzz
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
Yeah, when set to zero. For every other value, though, lol
– enigmaticPhysicist
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
for electronic music and "kickdrum synth" is a good tweak :)
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
also: why on code sample 0 do you use "pow" instead of exp? Its way slower...
– markzzz
yesterday
|
show 3 more comments
up vote
4
down vote
General overview
I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.
The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue()
doesn't indicate what kind of processing was done; ln2per12
only captures how it was computed, rather than what it's for, and MyPlugin
looks like a generic temporary name.
Specific issues
BOUNDED()
is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp()
or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.
Variable-length arrays are not standard C++. Fix this by making voiceSize
and bufferSize
compile-time constants, aka constexpr
(rather than plain const
).
Prefer initialisation to assignment. In Param
, for example, we could make mMin
, mMax
and mRange
const if we initialize:
Param(double min, double max)
: mMin{min},
// mMax was never used, so don't store it
mRange{max - min}
{
}
// make them public, so we don't need `get_min()` etc.
const double mMin, mRange;
The uninitialised pModValues
pointer is especially dangerous here.
I don't understand why we're updating left
and right
identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy()
to duplicate it afterwards.
In Oscillator
, there's no need for pGain
, pOffset
and pPitch
to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc
) by making them simple member variables - don't write Java code!
Param pGain, pOffset, pPitch;
Oscillator()
: pGain{0.0, 1.0},
pOffset{-900.0, 900.0},
pPitch{-48.0, 48.0}
{ /* ... */ }
// destructor is now defaulted
Some functions from <cmath>
wrongly accessed in the global namespace (namely std::sin()
and std::exp()
); also std::memset()
from <cstring>
. These should be correctly qualified for portable code.
Prefer std::fill()
or std::fill_n()
over std::memset()
- the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:
std::fill_n(bufferLeft, blockSize, 0);
std::fill_n(bufferRight, blockSize, 0);
Declare main()
with no arguments when not intending to process command-line options.
If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native
to my compilation command).
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED
expandsx
three times. That's misleading ifx
is an expression with side-effects (e.g.*p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
– Toby Speight
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
add a comment |
up vote
4
down vote
General overview
I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.
The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue()
doesn't indicate what kind of processing was done; ln2per12
only captures how it was computed, rather than what it's for, and MyPlugin
looks like a generic temporary name.
Specific issues
BOUNDED()
is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp()
or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.
Variable-length arrays are not standard C++. Fix this by making voiceSize
and bufferSize
compile-time constants, aka constexpr
(rather than plain const
).
Prefer initialisation to assignment. In Param
, for example, we could make mMin
, mMax
and mRange
const if we initialize:
Param(double min, double max)
: mMin{min},
// mMax was never used, so don't store it
mRange{max - min}
{
}
// make them public, so we don't need `get_min()` etc.
const double mMin, mRange;
The uninitialised pModValues
pointer is especially dangerous here.
I don't understand why we're updating left
and right
identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy()
to duplicate it afterwards.
In Oscillator
, there's no need for pGain
, pOffset
and pPitch
to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc
) by making them simple member variables - don't write Java code!
Param pGain, pOffset, pPitch;
Oscillator()
: pGain{0.0, 1.0},
pOffset{-900.0, 900.0},
pPitch{-48.0, 48.0}
{ /* ... */ }
// destructor is now defaulted
Some functions from <cmath>
wrongly accessed in the global namespace (namely std::sin()
and std::exp()
); also std::memset()
from <cstring>
. These should be correctly qualified for portable code.
Prefer std::fill()
or std::fill_n()
over std::memset()
- the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:
std::fill_n(bufferLeft, blockSize, 0);
std::fill_n(bufferRight, blockSize, 0);
Declare main()
with no arguments when not intending to process command-line options.
If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native
to my compilation command).
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED
expandsx
three times. That's misleading ifx
is an expression with side-effects (e.g.*p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
– Toby Speight
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
add a comment |
up vote
4
down vote
up vote
4
down vote
General overview
I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.
The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue()
doesn't indicate what kind of processing was done; ln2per12
only captures how it was computed, rather than what it's for, and MyPlugin
looks like a generic temporary name.
Specific issues
BOUNDED()
is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp()
or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.
Variable-length arrays are not standard C++. Fix this by making voiceSize
and bufferSize
compile-time constants, aka constexpr
(rather than plain const
).
Prefer initialisation to assignment. In Param
, for example, we could make mMin
, mMax
and mRange
const if we initialize:
Param(double min, double max)
: mMin{min},
// mMax was never used, so don't store it
mRange{max - min}
{
}
// make them public, so we don't need `get_min()` etc.
const double mMin, mRange;
The uninitialised pModValues
pointer is especially dangerous here.
I don't understand why we're updating left
and right
identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy()
to duplicate it afterwards.
In Oscillator
, there's no need for pGain
, pOffset
and pPitch
to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc
) by making them simple member variables - don't write Java code!
Param pGain, pOffset, pPitch;
Oscillator()
: pGain{0.0, 1.0},
pOffset{-900.0, 900.0},
pPitch{-48.0, 48.0}
{ /* ... */ }
// destructor is now defaulted
Some functions from <cmath>
wrongly accessed in the global namespace (namely std::sin()
and std::exp()
); also std::memset()
from <cstring>
. These should be correctly qualified for portable code.
Prefer std::fill()
or std::fill_n()
over std::memset()
- the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:
std::fill_n(bufferLeft, blockSize, 0);
std::fill_n(bufferRight, blockSize, 0);
Declare main()
with no arguments when not intending to process command-line options.
If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native
to my compilation command).
General overview
I found this hard to follow. The logic is scattered across several coupled classes without a clear idea of a specific single responsibility for each class.
The class, function and variable names aren't as helpful as they could be - for example, GetProcessedVoiceValue()
doesn't indicate what kind of processing was done; ln2per12
only captures how it was computed, rather than what it's for, and MyPlugin
looks like a generic temporary name.
Specific issues
BOUNDED()
is a dangerous macro - it expands its arguments more than once, meaning that any side-effects can be multiply executed, unlike the equivalent function call. Prefer to use a function - either std::clamp()
or your own version of it (if you need to build with old compilers). Such a functions is generally a zero-cost abstraction in C++.
Variable-length arrays are not standard C++. Fix this by making voiceSize
and bufferSize
compile-time constants, aka constexpr
(rather than plain const
).
Prefer initialisation to assignment. In Param
, for example, we could make mMin
, mMax
and mRange
const if we initialize:
Param(double min, double max)
: mMin{min},
// mMax was never used, so don't store it
mRange{max - min}
{
}
// make them public, so we don't need `get_min()` etc.
const double mMin, mRange;
The uninitialised pModValues
pointer is especially dangerous here.
I don't understand why we're updating left
and right
identically. We can get better special locality by writing just one of them, and (if we really need two identical buffers) using std::copy()
to duplicate it afterwards.
In Oscillator
, there's no need for pGain
, pOffset
and pPitch
to be pointers. Eliminate the memory allocation (and risk of leaks when one of them throws std::bad_alloc
) by making them simple member variables - don't write Java code!
Param pGain, pOffset, pPitch;
Oscillator()
: pGain{0.0, 1.0},
pOffset{-900.0, 900.0},
pPitch{-48.0, 48.0}
{ /* ... */ }
// destructor is now defaulted
Some functions from <cmath>
wrongly accessed in the global namespace (namely std::sin()
and std::exp()
); also std::memset()
from <cstring>
. These should be correctly qualified for portable code.
Prefer std::fill()
or std::fill_n()
over std::memset()
- the former takes care of conversions and constructors where necessary, and is equivalent to the latter in simple cases, so it's a good habit to acquire:
std::fill_n(bufferLeft, blockSize, 0);
std::fill_n(bufferRight, blockSize, 0);
Declare main()
with no arguments when not intending to process command-line options.
If you can, make sure your compiler targets the particular hardware you'll be using (for me the speed more than doubles when I add -O3 -march=native
to my compilation command).
edited 2 days ago
answered 2 days ago
Toby Speight
22.2k536108
22.2k536108
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED
expandsx
three times. That's misleading ifx
is an expression with side-effects (e.g.*p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
– Toby Speight
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
add a comment |
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED
expandsx
three times. That's misleading ifx
is an expression with side-effects (e.g.*p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.
– Toby Speight
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED: What do you mean "evalutate its arguments more than once"?
– markzzz
2 days ago
BOUNDED
expands x
three times. That's misleading if x
is an expression with side-effects (e.g. *p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.– Toby Speight
2 days ago
BOUNDED
expands x
three times. That's misleading if x
is an expression with side-effects (e.g. *p++
) - it's very different from a function call. Similarly for the other arguments that appear multiple times in the expansion. That's why we use all-caps for macro names, but it's better to avoid the problem by using a function instead. There's usually no cost to that in C++.– Toby Speight
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
I see. anyway, I've applied most of your suggestions, nothing change :) Compiler targets its the max I can (O2, or Ox in MSVC). Dunno its that!
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
The logic of the program is: having a DLL (MyPlugin), call the ProcessDoubleReplace from a Host that load that DLL. The plugin need to take each param value and sum modulations values (which change across time). Once processed, I must apply these params (3 in this case: gain, pitch, offset) and produce audio (sine wave in this case). That's all, considering it process per voices :) Note: left and right is because signal is stereo, and in the future I could process them separately).
– markzzz
2 days ago
add a comment |
up vote
1
down vote
I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.
$$12 log_2 frac {130.81278} {55} approx 15$$
$$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$
$$f approx 130.812,782,650,299...$$
In C,
const double noteFrequency = 110*pow(2, 0.25);
add a comment |
up vote
1
down vote
I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.
$$12 log_2 frac {130.81278} {55} approx 15$$
$$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$
$$f approx 130.812,782,650,299...$$
In C,
const double noteFrequency = 110*pow(2, 0.25);
add a comment |
up vote
1
down vote
up vote
1
down vote
I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.
$$12 log_2 frac {130.81278} {55} approx 15$$
$$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$
$$f approx 130.812,782,650,299...$$
In C,
const double noteFrequency = 110*pow(2, 0.25);
I'm filing this as a separate answer because it's on a very different topic from my previous one. You have a constant for note frequency, but (a) it isn't precise and (b) you should compute it rather than writing the constant.
$$12 log_2 frac {130.81278} {55} approx 15$$
$$f = 55 times 2^{15/12} = 110 times 2^{1/4}$$
$$f approx 130.812,782,650,299...$$
In C,
const double noteFrequency = 110*pow(2, 0.25);
edited 2 days ago
answered 2 days ago
Reinderien
1,272516
1,272516
add a comment |
add a comment |
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f207967%2fmodulating-an-oscillator-by-gain-pitch-and-offset%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
1
I changed the title so that it describes what the code does per site goals: "State what your code does in your title, not your main concerns about it.". Please check that I haven't misrepresented your code, and correct it if I have.
– Toby Speight
2 days ago