diff --git a/cpp/src/neon_ladder.s b/cpp/src/neon_ladder.s
index 565e50c..aec0697 100644
--- a/cpp/src/neon_ladder.s
+++ b/cpp/src/neon_ladder.s
@@ -233,12 +233,12 @@ neon_ladder_lin_1:
     .type   neon_ladder_mkmatrix, %function
 neon_ladder_mkmatrix:
 @ r0 = pointer to params (a, k)
-@ r1 = out pointer to matrix (A then B, just like consumer)
+@ r1 = out pointer to 20-float matrix, 16-byte aligned (stored as B then A)
+@ TODO: produce output as A then B, avoiding copies
 	vpush {q4-q7}
 	vld1.32 {d0[]}, [r0]!  @ a
 	vmov.i32 d1, #0
-	vneg.f32 d2, d0
-	vmov.f32 s0, s4
+	vneg.f32 s0, s0
 
 	vld1.32 {d6[0]}, [r0]  @ k
 	vmov.i32 q2, #0
@@ -305,28 +305,29 @@ neon_ladder_mkmatrix2:
 	bne neon_ladder_mkmatrix2
 
 	@ unwrap toeplitz matrix into the full form
-	vst1.32 {q0}, [r1]!
+	add r1, #16  @ TODO: remove this for A then B output
+	vst1.32 {q0}, [r1,:128]!
 	vext.32 q8, q2, q0, #3
-	vst1.32 {q8}, [r1]!
+	vst1.32 {q8}, [r1,:128]!
 	vext.32 q9, q2, q0, #2
-	vst1.32 {q9}, [r1]!
+	vst1.32 {q9}, [r1,:128]!
 	vext.32 q10, q2, q0, #1
-	vst1.32 {q10}, [r1]!
+	vst1.32 {q10}, [r1,:128]!
+	sub r1, #80  @ TODO: remove this for A then B output
 
-	adr r2, neon_ladder_mkmatrix_const
-	vld1.32 {d2[], d3[]}, [r2]
+	vmov.f32 q1, 1.0
 	vld1.32 {d4[0]}, [r0]  @ k
 	vadd.f32 d4, d2
 	vrecpe.f32 d6, d4
-	vrecps.f32 d2, d6, d4
+	vrecps.f32 d5, d6, d4
 
 	vadd.f32 q0, q8
 	vadd.f32 q9, q10
 	vsub.f32 q15, q1, q0
-	vmul.f32 d6, d2  @ 1 / (1 + k)
+	vmul.f32 d6, d5  @ 1 / (1 + k)
 	vsub.f32 q15, q9
 	vmul.f32 q15, d6[0]
-	vst1.32 {q15}, [r1]!
+	vst1.32 {q15}, [r1,:128]!
 
 	vpop {q4-q7}
 	bx lr
diff --git a/cpp/src/resofilter.cc b/cpp/src/resofilter.cc
index 80aaffc..7db35bb 100644
--- a/cpp/src/resofilter.cc
+++ b/cpp/src/resofilter.cc
@@ -38,6 +38,7 @@ void neon_ladder_nl(const int32_t *in, const float *a, int32_t *out, int count,
 extern "C"
 void neon_ladder_lin(const int32_t *in, const float *a, int32_t *out, int count,
   float *state);
+extern "C" void neon_ladder_mkmatrix(const float *in, float *out);
 #endif
 
 double this_sample_rate;
@@ -56,7 +57,6 @@ ResoFilter::ResoFilter() {
 }
 
 int32_t compute_alpha(int32_t logf) {
-  // TODO: better tuning
   return min(1 << 24, Freqlut::lookup(logf));
 }
 
@@ -178,9 +178,15 @@ static void make_state_transition(float result[20], int32_t f0, int32_t k) {
 }
 
 void test_matrix() {
-  float a[20];
-  make_state_transition(a, 1.0 * (1 << 24), 3.99 * (1 << 24));
-  dump_matrix(a);
+  float params[2] = {1.0, 3.99};  // {f0, k} as floats; scaled by 2^24 for the reference call
+  AlignedBuf<float, 20> a;  // one buffer reused by both implementations; NEON stores use :128 hints
+  make_state_transition(a.get(), params[0] * (1 << 24), params[1] * (1 << 24));  // reference result
+  dump_matrix(a.get());
+#ifdef HAVE_NEON
+  params[0] /= 16;  // NEON routine takes fixed-point f0 * 2^-28 (as in ResoFilter::process), i.e. 1/16 here
+  neon_ladder_mkmatrix(params, a.get());  // overwrites a; NOTE(review): output is B then A per the asm header
+  dump_matrix(a.get());  // dumped for manual comparison against the reference dump above
+#endif
 }
 
 #if defined(USE_MATRIX)
@@ -193,11 +199,20 @@ static float sigmoid(float x, float overdrive) {
 void ResoFilter::process(const int32_t **inbufs, const int32_t *control_in,
                          const int32_t *control_last, int32_t **outbufs) {
   AlignedBuf<float, 20> a;
-  make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
   float overdrive = control_in[2] * (1.0 / (1 << 24));
   const int32_t *ibuf = inbufs[0];
   int32_t *obuf = outbufs[0];
   bool useneon = hasNeon();
+  if (useneon) {
+#ifdef HAVE_NEON
+    float params[2];
+    params[0] = compute_alpha(control_in[0]) * (1.0 / (1 << 28));
+    params[1] = control_in[1] * (1.0 / (1 << 24));
+    neon_ladder_mkmatrix(params, a.get());
+#endif
+  } else {
+    make_state_transition(a.get(), compute_alpha(control_in[0]), control_in[1]);
+  }
 
   if (overdrive < 0.01) {
     if (useneon) {