FFmpeg  3.4.9
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem.h"
33 #include "avcodec.h"
34 #include "internal.h"
35 #include "get_bits.h"
36 #include "put_bits.h"
37 #include "wmavoice_data.h"
38 #include "celp_filters.h"
39 #include "acelp_vectors.h"
40 #include "acelp_filters.h"
41 #include "lsp.h"
42 #include "dct.h"
43 #include "rdft.h"
44 #include "sinewin.h"
45 
/* Compile-time size limits used for the fixed-size buffers of this decoder. */
46 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
47 #define MAX_LSPS 16 ///< maximum filter order
48 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be a multiple
49  ///< of 16 for ASM input buffer alignment
50 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
51 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
52 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history
53 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
54  ///< maximum number of samples per superframe (160 * 3 = 480)
55 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
56  ///< was split over two packets
57 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
58 
59 /**
60  * Frame type VLC coding.
61  */
63 
/**
 * Adaptive codebook (ACB) types.
 */
enum {
    /** No adaptive codebook at all (only the hardcoded fixed codebook). */
    ACB_TYPE_NONE       = 0,

    /**
     * Adaptive codebook driven by a per-frame pitch, which is interpolated
     * to obtain a per-sample pitch. The signal is generated using an
     * asymmetric sinc window function.
     * @note see #wmavoice_ipol1_coeffs
     */
    ACB_TYPE_ASYMMETRIC = 1,

    /**
     * Adaptive codebook driven by a per-block pitch, with signal generation
     * using a Hamming sinc window function.
     * @note see #wmavoice_ipol2_coeffs
     */
    ACB_TYPE_HAMMING    = 2
};
78 
/**
 * Fixed codebook (FCB) types.
 */
enum {
    /**
     * Comfort noise during silence, generated from a hardcoded (fixed)
     * codebook with per-frame (low) gain values.
     */
    FCB_TYPE_SILENCE    = 0,

    /** Hardcoded (fixed) codebook with per-block gain values. */
    FCB_TYPE_HARDCODED  = 1,

    /**
     * Pitch-adaptive window (AW) pulse signals, used in particular for
     * low-bitrate streams.
     */
    FCB_TYPE_AW_PULSES  = 2,

    /**
     * Innovation (fixed) codebook pulse sets, in combinations of either
     * single pulses or pulse pairs.
     */
    FCB_TYPE_EXC_PULSES = 3,
};
94 
95 /**
96  * Description of frame types.
 *
 * Each row gives, in order: n_blocks, log_n_blocks, acb_type, fcb_type and
 * dbl_pulses for one frame type.
 *
 * NOTE(review): the array is declared with 17 entries but only the first two
 * rows are visible here, and the listing numbers jump from 109 to 125. The
 * other rows were presumably lost when this listing was extracted - restore
 * them from the upstream file before building. As written, entries 2..16
 * would be zero-initialized.
97  */
98 static const struct frame_type_desc {
99  uint8_t n_blocks; ///< number of blocks per frame (each block
100  ///< contains 160/#n_blocks samples)
101  uint8_t log_n_blocks; ///< log2(#n_blocks)
102  uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
103  uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
104  uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
105  ///< (rather than just one single pulse)
106  ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
107 } frame_descs[17] = {
108  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0 },
109  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0 },
125 };
126 
127 /**
128  * WMA Voice decoding context.
129  */
130 typedef struct WMAVoiceContext {
131  /**
132  * @name Global values specified in the stream header / extradata or used all over.
133  * @{
134  */
135  GetBitContext gb; ///< packet bitreader. During decoder init,
136  ///< it contains the extradata from the
137  ///< demuxer. During decoding, it contains
138  ///< packet data.
139  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
140 
141  int spillover_bitsize; ///< number of bits used to specify
142  ///< #spillover_nbits in the packet header
143  ///< = ceil(log2(ctx->block_align << 3))
144  int history_nsamples; ///< number of samples in history for signal
145  ///< prediction (through ACB)
146 
147  /* postfilter specific values */
148  int do_apf; ///< whether to apply the averaged
149  ///< projection filter (APF)
150  int denoise_strength; ///< strength of denoising in Wiener filter
151  ///< [0-11]
152  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
153  ///< Wiener filter coefficients (postfilter)
154  int dc_level; ///< Predicted amount of DC noise, based
155  ///< on which a DC removal filter is used
156 
157  int lsps; ///< number of LSPs per frame [10 or 16]
158  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
159  int lsp_def_mode; ///< defines different sets of LSP defaults
160  ///< [0, 1]
161 
162  int min_pitch_val; ///< base value for pitch parsing code
163  int max_pitch_val; ///< max value + 1 for pitch parsing
164  int pitch_nbits; ///< number of bits used to specify the
165  ///< pitch value in the frame header
166  int block_pitch_nbits; ///< number of bits used to specify the
167  ///< first block's pitch value
168  int block_pitch_range; ///< range of the block pitch
169  int block_delta_pitch_nbits; ///< number of bits used to specify the
170  ///< delta pitch between this and the last
171  ///< block's pitch value, used in all but
172  ///< first block
173  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
174  ///< from -this to +this-1)
175  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
176  ///< conversion
177 
178  /**
179  * @}
180  *
181  * @name Packet values specified in the packet header or related to a packet.
182  *
183  * A packet is considered to be a single unit of data provided to this
184  * decoder by the demuxer.
185  * @{
186  */
187  int spillover_nbits; ///< number of bits of the previous packet's
188  ///< last superframe preceding this
189  ///< packet's first full superframe (useful
190  ///< for re-synchronization also)
191  int has_residual_lsps; ///< if set, superframes contain one set of
192  ///< LSPs that cover all frames, encoded as
193  ///< independent and residual LSPs; if not
194  ///< set, each frame contains its own, fully
195  ///< independent, LSPs
196  int skip_bits_next; ///< number of bits to skip at the next call
197  ///< to #wmavoice_decode_packet() (since
198  ///< they're part of the previous superframe)
199 
201  ///< cache for superframe data split over
202  ///< multiple packets
203  int sframe_cache_size; ///< set to >0 if we have data from an
204  ///< (incomplete) superframe from a previous
205  ///< packet that spilled over in the current
206  ///< packet; specifies the amount of bits in
207  ///< #sframe_cache
208  PutBitContext pb; ///< bitstream writer for #sframe_cache
209 
210  /**
211  * @}
212  *
213  * @name Frame and superframe values
214  * Superframe and frame data - these can change from frame to frame,
215  * although some of them do in that case serve as a cache / history for
216  * the next frame or superframe.
217  * @{
218  */
219  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
220  ///< superframe
221  int last_pitch_val; ///< pitch value of the previous frame
222  int last_acb_type; ///< frame type [0-2] of the previous frame
223  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
224  ///< << 16) / #MAX_FRAMESIZE
225  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
226 
227  int aw_idx_is_ext; ///< whether the AW index was encoded in
228  ///< 8 bits (instead of 6)
229  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
230  ///< can apply the pulse, relative to the
231  ///< value in aw_first_pulse_off. The exact
232  ///< position of the first AW-pulse is within
233  ///< [pulse_off, pulse_off + this], and
234  ///< depends on bitstream values; [16 or 24]
235  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
236  ///< that this number can be negative (in
237  ///< which case it basically means "zero")
238  int aw_first_pulse_off[2]; ///< index of first sample to which to
239  ///< apply AW-pulses, or -0xff if unset
240  int aw_next_pulse_off_cache; ///< the position (relative to start of the
241  ///< second block) at which pulses should
242  ///< start to be positioned, serves as a
243  ///< cache for pitch-adaptive window pulses
244  ///< between blocks
245 
246  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
247  ///< only used for comfort noise in #pRNG()
248  int nb_superframes; ///< number of superframes in current packet
249  float gain_pred_err[6]; ///< cache for gain prediction
250  float excitation_history[MAX_SIGNAL_HISTORY];
251  ///< cache of the signal of previous
252  ///< superframes, used as a history for
253  ///< signal generation
254  float synth_history[MAX_LSPS]; ///< see #excitation_history
255  /**
256  * @}
257  *
258  * @name Postfilter values
259  *
260  * Variables used for postfilter implementation, mostly history for
261  * smoothing and so on, and context variables for FFT/iFFT.
262  * @{
263  */
264  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
265  ///< postfilter (for denoise filter)
266  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
267  ///< transform, part of postfilter)
268  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
269  ///< range
270  float postfilter_agc; ///< gain control memory, used in
271  ///< #adaptive_gain_control()
272  float dcf_mem[2]; ///< DC filter history
273  float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
274  ///< zero filter output (i.e. excitation)
275  ///< by postfilter
276  float denoise_filter_cache[MAX_FRAMESIZE];
277  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
278  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
279  ///< aligned buffer for LPC tilting
280  DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
281  ///< aligned buffer for denoise coefficients
282  DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
283  ///< aligned buffer for postfilter speech
284  ///< synthesis
285  /**
286  * @}
287  */
289 
290 /**
291  * Set up the variable bit mode (VBM) tree from container extradata.
292  * @param gb bit I/O context.
293  * The bit context (s->gb) should be loaded with byte 23-46 of the
294  * container extradata (i.e. the ones containing the VBM tree).
295  * @param vbm_tree pointer to array to which the decoded VBM tree will be
296  * written.
297  * @return 0 on success, <0 on error.
298  */
299 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
300 {
301  int cntr[8] = { 0 }, n, res;
302 
303  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
304  for (n = 0; n < 17; n++) {
305  res = get_bits(gb, 3);
306  if (cntr[res] > 3) // should be >= 3 + (res == 7))
307  return -1;
308  vbm_tree[res * 3 + cntr[res]++] = n;
309  }
310  return 0;
311 }
312 
314 {
315  static const uint8_t bits[] = {
316  2, 2, 2, 4, 4, 4,
317  6, 6, 6, 8, 8, 8,
318  10, 10, 10, 12, 12, 12,
319  14, 14, 14, 14
320  };
321  static const uint16_t codes[] = {
322  0x0000, 0x0001, 0x0002, // 00/01/10
323  0x000c, 0x000d, 0x000e, // 11+00/01/10
324  0x003c, 0x003d, 0x003e, // 1111+00/01/10
325  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
326  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
327  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
328  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
329  };
330 
331  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
332  bits, 1, 1, codes, 2, 2, 132);
333 }
334 
336 {
337  WMAVoiceContext *s = ctx->priv_data;
338  int n;
339 
340  s->postfilter_agc = 0;
341  s->sframe_cache_size = 0;
342  s->skip_bits_next = 0;
343  for (n = 0; n < s->lsps; n++)
344  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
345  memset(s->excitation_history, 0,
346  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
347  memset(s->synth_history, 0,
348  sizeof(*s->synth_history) * MAX_LSPS);
349  memset(s->gain_pred_err, 0,
350  sizeof(s->gain_pred_err));
351 
352  if (s->do_apf) {
353  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
354  sizeof(*s->synth_filter_out_buf) * s->lsps);
355  memset(s->dcf_mem, 0,
356  sizeof(*s->dcf_mem) * 2);
357  memset(s->zero_exc_pf, 0,
358  sizeof(*s->zero_exc_pf) * s->history_nsamples);
359  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
360  }
361 }
362 
363 /**
364  * Set up decoder with parameters from demuxer (extradata etc.).
365  */
367 {
368  int n, flags, pitch_range, lsp16_flag;
369  WMAVoiceContext *s = ctx->priv_data;
370 
371  /**
372  * Extradata layout:
373  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
374  * - byte 19-22: flags field (annoyingly in LE; see below for known
375  * values),
376  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
377  * rest is 0).
378  */
379  if (ctx->extradata_size != 46) {
380  av_log(ctx, AV_LOG_ERROR,
381  "Invalid extradata size %d (should be 46)\n",
382  ctx->extradata_size);
383  return AVERROR_INVALIDDATA;
384  }
385  if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
386  av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
387  return AVERROR_INVALIDDATA;
388  }
389 
390  flags = AV_RL32(ctx->extradata + 18);
391  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
392  s->do_apf = flags & 0x1;
393  if (s->do_apf) {
394  ff_rdft_init(&s->rdft, 7, DFT_R2C);
395  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
396  ff_dct_init(&s->dct, 6, DCT_I);
397  ff_dct_init(&s->dst, 6, DST_I);
398 
399  ff_sine_window_init(s->cos, 256);
400  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
401  for (n = 0; n < 255; n++) {
402  s->sin[n] = -s->sin[510 - n];
403  s->cos[510 - n] = s->cos[n];
404  }
405  }
406  s->denoise_strength = (flags >> 2) & 0xF;
407  if (s->denoise_strength >= 12) {
408  av_log(ctx, AV_LOG_ERROR,
409  "Invalid denoise filter strength %d (max=11)\n",
410  s->denoise_strength);
411  return AVERROR_INVALIDDATA;
412  }
413  s->denoise_tilt_corr = !!(flags & 0x40);
414  s->dc_level = (flags >> 7) & 0xF;
415  s->lsp_q_mode = !!(flags & 0x2000);
416  s->lsp_def_mode = !!(flags & 0x4000);
417  lsp16_flag = flags & 0x1000;
418  if (lsp16_flag) {
419  s->lsps = 16;
420  } else {
421  s->lsps = 10;
422  }
423  for (n = 0; n < s->lsps; n++)
424  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
425 
426  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
427  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
428  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
429  return AVERROR_INVALIDDATA;
430  }
431 
432  if (ctx->sample_rate >= INT_MAX / (256 * 37))
433  return AVERROR_INVALIDDATA;
434 
435  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
436  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
437  pitch_range = s->max_pitch_val - s->min_pitch_val;
438  if (pitch_range <= 0) {
439  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
440  return AVERROR_INVALIDDATA;
441  }
442  s->pitch_nbits = av_ceil_log2(pitch_range);
443  s->last_pitch_val = 40;
445  s->history_nsamples = s->max_pitch_val + 8;
446 
448  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
449  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
450 
451  av_log(ctx, AV_LOG_ERROR,
452  "Unsupported samplerate %d (min=%d, max=%d)\n",
453  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
454 
455  return AVERROR(ENOSYS);
456  }
457 
458  s->block_conv_table[0] = s->min_pitch_val;
459  s->block_conv_table[1] = (pitch_range * 25) >> 6;
460  s->block_conv_table[2] = (pitch_range * 44) >> 6;
461  s->block_conv_table[3] = s->max_pitch_val - 1;
462  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
463  if (s->block_delta_pitch_hrange <= 0) {
464  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
465  return AVERROR_INVALIDDATA;
466  }
467  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
469  s->block_conv_table[3] + 1 +
470  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
471  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
472 
473  ctx->channels = 1;
476 
477  return 0;
478 }
479 
480 /**
481  * @name Postfilter functions
482  * Postfilter functions (gain control, wiener denoise filter, DC filter,
483  * kalman smoothening, plus surrounding code to wrap it)
484  * @{
485  */
/**
 * Adaptive gain control (as used in the postfilter).
 *
 * Works like #ff_adaptive_gain_control() in acelp_vectors.c, with one
 * difference: the energy is measured here as sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * The per-sample gain is an exponentially smoothed value that converges
 * towards the ratio of the two energies; if the postfiltered input carries
 * no energy at all, the target contribution is zero.
 *
 * @param out          output buffer for the gain-corrected samples
 * @param in           input samples, as they are after the postfilter steps
 *                     applied so far
 * @param speech_synth speech synthesis output before any postfiltering
 * @param size         number of samples in each of the buffers
 * @param alpha        exponential filter factor
 * @param gain_mem     pointer to the filter memory (single float); read on
 *                     entry and updated on exit
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    float synth_sum = 0.0, filtered_sum = 0.0;
    float target_gain;
    float gain = *gain_mem;
    int k;

    /* sum of absolute values serves as the energy measure */
    for (k = 0; k < size; k++) {
        synth_sum    += fabsf(speech_synth[k]);
        filtered_sum += fabsf(in[k]);
    }

    if (filtered_sum == 0.0)
        target_gain = 0.0;
    else
        target_gain = (1.0 - alpha) * synth_sum / filtered_sum;

    /* smooth the gain sample by sample and apply it */
    for (k = 0; k < size; k++) {
        gain   = alpha * gain + target_gain;
        out[k] = in[k] * gain;
    }

    *gain_mem = gain;
}
523 
524 /**
525  * Kalman smoothing function.
526  *
527  * This function looks back pitch +/- 3 samples back into history to find
528  * the best fitting curve (that one giving the optimal gain of the two
529  * signals, i.e. the highest dot product between the two), and then
530  * uses that signal history to smoothen the output of the speech synthesis
531  * filter.
532  *
533  * @param s WMA Voice decoding context
534  * @param pitch pitch of the speech signal
535  * @param in input speech signal
536  * @param out output pointer for smoothened signal
537  * @param size input/output buffer size
538  *
539  * @returns -1 if no smoothening took place, e.g. because no optimal
540  * fit could be found, or 0 on success.
541  */
542 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
543  const float *in, float *out, int size)
544 {
545  int n;
546  float optimal_gain = 0, dot;
547  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
548  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
549  *best_hist_ptr = NULL;
550 
551  /* find best fitting point in history */
552  do {
553  dot = avpriv_scalarproduct_float_c(in, ptr, size);
554  if (dot > optimal_gain) {
555  optimal_gain = dot;
556  best_hist_ptr = ptr;
557  }
558  } while (--ptr >= end);
559 
560  if (optimal_gain <= 0)
561  return -1;
562  dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
563  if (dot <= 0) // would be 1.0
564  return -1;
565 
566  if (optimal_gain <= dot) {
567  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
568  } else
569  dot = 0.625;
570 
571  /* actual smoothing */
572  for (n = 0; n < size; n++)
573  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
574 
575  return 0;
576 }
577 
/**
 * Get the tilt factor of a formant filter from its transfer function.
 *
 * The result is the ratio of two sums: lpcs[0] plus the dot product of the
 * coefficients with themselves shifted by one position, divided by 1 plus
 * the dot product of the coefficients with themselves.
 *
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 *      but somehow (??) it does a speech synthesis filter in the
 *      middle, which is missing here
 *
 * @param lpcs   LPC coefficients
 * @param n_lpcs Size of LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    const float corr_lag0 = 1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs,
                                                               n_lpcs);
    const float corr_lag1 = lpcs[0] +
                            avpriv_scalarproduct_float_c(lpcs, lpcs + 1,
                                                         n_lpcs - 1);

    return corr_lag1 / corr_lag0;
}
597 
598 /**
599  * Derive denoise filter coefficients (in real domain) from the LPCs.
600  */
601 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
602  int fcb_type, float *coeffs, int remainder)
603 {
604  float last_coeff, min = 15.0, max = -15.0;
605  float irange, angle_mul, gain_mul, range, sq;
606  int n, idx;
607 
608  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
609  s->rdft.rdft_calc(&s->rdft, lpcs);
610 #define log_range(var, assign) do { \
611  float tmp = log10f(assign); var = tmp; \
612  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
613  } while (0)
614  log_range(last_coeff, lpcs[1] * lpcs[1]);
615  for (n = 1; n < 64; n++)
616  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
617  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
618  log_range(lpcs[0], lpcs[0] * lpcs[0]);
619 #undef log_range
620  range = max - min;
621  lpcs[64] = last_coeff;
622 
623  /* Now, use this spectrum to pick out these frequencies with higher
624  * (relative) power/energy (which we then take to be "not noise"),
625  * and set up a table (still in lpc[]) of (relative) gains per frequency.
626  * These frequencies will be maintained, while others ("noise") will be
627  * decreased in the filter output. */
628  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
629  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
630  (5.0 / 14.7));
631  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
632  for (n = 0; n <= 64; n++) {
633  float pwr;
634 
635  idx = lrint((max - lpcs[n]) * irange - 1);
636  idx = FFMAX(0, idx);
638  lpcs[n] = angle_mul * pwr;
639 
640  /* 70.57 =~ 1/log10(1.0331663) */
641  idx = av_clipf((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
642 
643  if (idx > 127) { // fall back if index falls outside table range
644  coeffs[n] = wmavoice_energy_table[127] *
645  powf(1.0331663, idx - 127);
646  } else
647  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
648  }
649 
650  /* calculate the Hilbert transform of the gains, which we do (since this
651  * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
652  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
653  * "moment" of the LPCs in this filter. */
654  s->dct.dct_calc(&s->dct, lpcs);
655  s->dst.dct_calc(&s->dst, lpcs);
656 
657  /* Split out the coefficient indexes into phase/magnitude pairs */
658  idx = 255 + av_clip(lpcs[64], -255, 255);
659  coeffs[0] = coeffs[0] * s->cos[idx];
660  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
661  last_coeff = coeffs[64] * s->cos[idx];
662  for (n = 63;; n--) {
663  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
664  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
665  coeffs[n * 2] = coeffs[n] * s->cos[idx];
666 
667  if (!--n) break;
668 
669  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
670  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
671  coeffs[n * 2] = coeffs[n] * s->cos[idx];
672  }
673  coeffs[1] = last_coeff;
674 
675  /* move into real domain */
676  s->irdft.rdft_calc(&s->irdft, coeffs);
677 
678  /* tilt correction and normalize scale */
679  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
680  if (s->denoise_tilt_corr) {
681  float tilt_mem = 0;
682 
683  coeffs[remainder - 1] = 0;
684  ff_tilt_compensation(&tilt_mem,
685  -1.8 * tilt_factor(coeffs, remainder - 1),
686  coeffs, remainder);
687  }
688  sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
689  remainder));
690  for (n = 0; n < remainder; n++)
691  coeffs[n] *= sq;
692 }
693 
694 /**
695  * This function applies a Wiener filter on the (noisy) speech signal as
696  * a means to denoise it.
697  *
698  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
699  * - using this power spectrum, calculate (for each frequency) the Wiener
700  * filter gain, which depends on the frequency power and desired level
701  * of noise subtraction (when set too high, this leads to artifacts)
702  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
703  * of 4-8kHz);
704  * - by doing a phase shift, calculate the Hilbert transform of this array
705  * of per-frequency filter-gains to get the filtering coefficients;
706  * - smoothen/normalize/de-tilt these filter coefficients as desired;
707  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
708  * to get the denoised speech signal;
709  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
710  * the frame boundary) are saved and applied to subsequent frames by an
711  * overlap-add method (otherwise you get clicking-artifacts).
712  *
713  * @param s WMA Voice decoding context
714  * @param fcb_type Frame (codebook) type
715  * @param synth_pf input: the noisy speech signal, output: denoised speech
716  * data; should be 16-byte aligned (for ASM purposes)
717  * @param size size of the speech data
718  * @param lpcs LPCs used to synthesize this frame's speech data
719  */
721  float *synth_pf, int size,
722  const float *lpcs)
723 {
724  int remainder, lim, n;
725 
726  if (fcb_type != FCB_TYPE_SILENCE) {
727  float *tilted_lpcs = s->tilted_lpcs_pf,
728  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
729 
730  tilted_lpcs[0] = 1.0;
731  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
732  memset(&tilted_lpcs[s->lsps + 1], 0,
733  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
734  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
735  tilted_lpcs, s->lsps + 2);
736 
737  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
738  * size is applied to the next frame. All input beyond this is zero,
739  * and thus all output beyond this will go towards zero, hence we can
740  * limit to min(size-1, 127-size) as a performance consideration. */
741  remainder = FFMIN(127 - size, size - 1);
742  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
743 
744  /* apply coefficients (in frequency spectrum domain), i.e. complex
745  * number multiplication */
746  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
747  s->rdft.rdft_calc(&s->rdft, synth_pf);
748  s->rdft.rdft_calc(&s->rdft, coeffs);
749  synth_pf[0] *= coeffs[0];
750  synth_pf[1] *= coeffs[1];
751  for (n = 1; n < 64; n++) {
752  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
753  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
754  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
755  }
756  s->irdft.rdft_calc(&s->irdft, synth_pf);
757  }
758 
759  /* merge filter output with the history of previous runs */
760  if (s->denoise_filter_cache_size) {
761  lim = FFMIN(s->denoise_filter_cache_size, size);
762  for (n = 0; n < lim; n++)
763  synth_pf[n] += s->denoise_filter_cache[n];
764  s->denoise_filter_cache_size -= lim;
765  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
767  }
768 
769  /* move remainder of filter output into a cache for future runs */
770  if (fcb_type != FCB_TYPE_SILENCE) {
771  lim = FFMIN(remainder, s->denoise_filter_cache_size);
772  for (n = 0; n < lim; n++)
773  s->denoise_filter_cache[n] += synth_pf[size + n];
774  if (lim < remainder) {
775  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
776  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
777  s->denoise_filter_cache_size = remainder;
778  }
779  }
780 }
781 
782 /**
783  * Averaging projection filter, the postfilter used in WMAVoice.
784  *
785  * This uses the following steps:
786  * - A zero-synthesis filter (generate excitation from synth signal)
787  * - Kalman smoothing on excitation, based on pitch
788  * - Re-synthesized smoothened output
789  * - Iterative Wiener denoise filter
790  * - Adaptive gain filter
791  * - DC filter
792  *
793  * @param s WMAVoice decoding context
794  * @param synth Speech synthesis output (before postfilter)
795  * @param samples Output buffer for filtered samples
796  * @param size Buffer size of synth & samples
797  * @param lpcs Generated LPCs used for speech synthesis
798  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
799  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
800  * @param pitch Pitch of the input signal
801  */
802 static void postfilter(WMAVoiceContext *s, const float *synth,
803  float *samples, int size,
804  const float *lpcs, float *zero_exc_pf,
805  int fcb_type, int pitch)
806 {
807  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
808  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
809  *synth_filter_in = zero_exc_pf;
810 
811  av_assert0(size <= MAX_FRAMESIZE / 2);
812 
813  /* generate excitation from input signal */
814  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
815 
816  if (fcb_type >= FCB_TYPE_AW_PULSES &&
817  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
818  synth_filter_in = synth_filter_in_buf;
819 
820  /* re-synthesize speech after smoothening, and keep history */
821  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
822  synth_filter_in, size, s->lsps);
823  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
824  sizeof(synth_pf[0]) * s->lsps);
825 
826  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
827 
828  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
829  &s->postfilter_agc);
830 
831  if (s->dc_level > 8) {
832  /* remove ultra-low frequency DC noise / highpass filter;
833  * coefficients are identical to those used in SIPR decoding,
834  * and very closely resemble those used in AMR-NB decoding. */
836  (const float[2]) { -1.99997, 1.0 },
837  (const float[2]) { -1.9330735188, 0.93589198496 },
838  0.93980580475, s->dcf_mem, size);
839  }
840 }
841 /**
842  * @}
843  */
844 
845 /**
846  * Dequantize LSPs
847  * @param lsps output pointer to the array that will hold the LSPs
848  * @param num number of LSPs to be dequantized
849  * @param values quantized values, contains n_stages values
850  * @param sizes range (i.e. max value) of each quantized value
851  * @param n_stages number of dequantization runs
852  * @param table dequantization table to be used
853  * @param mul_q LSF multiplier
854  * @param base_q base (lowest) LSF values
855  */
856 static void dequant_lsps(double *lsps, int num,
857  const uint16_t *values,
858  const uint16_t *sizes,
859  int n_stages, const uint8_t *table,
860  const double *mul_q,
861  const double *base_q)
862 {
863  int n, m;
864 
865  memset(lsps, 0, num * sizeof(*lsps));
866  for (n = 0; n < n_stages; n++) {
867  const uint8_t *t_off = &table[values[n] * num];
868  double base = base_q[n], mul = mul_q[n];
869 
870  for (m = 0; m < num; m++)
871  lsps[m] += base + mul * t_off[m];
872 
873  table += sizes[n] * num;
874  }
875 }
876 
877 /**
878  * @name LSP dequantization routines
879  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
880  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
881  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
882  * @{
883  */
884 /**
885  * Parse 10 independently-coded LSPs.
886  */
887 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
888 {
889  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
890  static const double mul_lsf[4] = {
891  5.2187144800e-3, 1.4626986422e-3,
892  9.6179549166e-4, 1.1325736225e-3
893  };
894  static const double base_lsf[4] = {
895  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
896  M_PI * -3.3486e-2, M_PI * -5.7408e-2
897  };
898  uint16_t v[4];
899 
900  v[0] = get_bits(gb, 8);
901  v[1] = get_bits(gb, 6);
902  v[2] = get_bits(gb, 5);
903  v[3] = get_bits(gb, 5);
904 
905  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
906  mul_lsf, base_lsf);
907 }
908 
909 /**
910  * Parse 10 independently-coded LSPs, and then derive the tables to
911  * generate LSPs for the other frames from them (residual coding).
912  */
914  double *i_lsps, const double *old,
915  double *a1, double *a2, int q_mode)
916 {
917  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
918  static const double mul_lsf[3] = {
919  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
920  };
921  static const double base_lsf[3] = {
922  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
923  };
924  const float (*ipol_tab)[2][10] = q_mode ?
926  uint16_t interpol, v[3];
927  int n;
928 
929  dequant_lsp10i(gb, i_lsps);
930 
931  interpol = get_bits(gb, 5);
932  v[0] = get_bits(gb, 7);
933  v[1] = get_bits(gb, 6);
934  v[2] = get_bits(gb, 6);
935 
936  for (n = 0; n < 10; n++) {
937  double delta = old[n] - i_lsps[n];
938  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
939  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
940  }
941 
942  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
943  mul_lsf, base_lsf);
944 }
945 
946 /**
947  * Parse 16 independently-coded LSPs.
948  */
949 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
950 {
951  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
952  static const double mul_lsf[5] = {
953  3.3439586280e-3, 6.9908173703e-4,
954  3.3216608306e-3, 1.0334960326e-3,
955  3.1899104283e-3
956  };
957  static const double base_lsf[5] = {
958  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
959  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
960  M_PI * -1.29816e-1
961  };
962  uint16_t v[5];
963 
964  v[0] = get_bits(gb, 8);
965  v[1] = get_bits(gb, 6);
966  v[2] = get_bits(gb, 7);
967  v[3] = get_bits(gb, 6);
968  v[4] = get_bits(gb, 7);
969 
970  dequant_lsps( lsps, 5, v, vec_sizes, 2,
971  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
972  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
973  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
974  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
975  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
976 }
977 
978 /**
979  * Parse 16 independently-coded LSPs, and then derive the tables to
980  * generate LSPs for the other frames from them (residual coding).
981  */
983  double *i_lsps, const double *old,
984  double *a1, double *a2, int q_mode)
985 {
986  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
987  static const double mul_lsf[3] = {
988  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
989  };
990  static const double base_lsf[3] = {
991  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
992  };
993  const float (*ipol_tab)[2][16] = q_mode ?
995  uint16_t interpol, v[3];
996  int n;
997 
998  dequant_lsp16i(gb, i_lsps);
999 
1000  interpol = get_bits(gb, 5);
1001  v[0] = get_bits(gb, 7);
1002  v[1] = get_bits(gb, 7);
1003  v[2] = get_bits(gb, 7);
1004 
1005  for (n = 0; n < 16; n++) {
1006  double delta = old[n] - i_lsps[n];
1007  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1008  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1009  }
1010 
1011  dequant_lsps( a2, 10, v, vec_sizes, 1,
1012  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
1013  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1014  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1015  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1016  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1017 }
1018 
1019 /**
1020  * @}
1021  * @name Pitch-adaptive window coding functions
1022  * The next few functions are for pitch-adaptive window coding.
1023  * @{
1024  */
1025 /**
1026  * Parse the offset of the first pitch-adaptive window pulses, and
1027  * the distribution of pulses between the two blocks in this frame.
1028  * @param s WMA Voice decoding context private data
1029  * @param gb bit I/O context
1030  * @param pitch pitch for each block in this frame
1031  */
1033  const int *pitch)
1034 {
1035  static const int16_t start_offset[94] = {
1036  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1037  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1038  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1039  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1040  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1041  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1042  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1043  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1044  };
1045  int bits, offset;
1046 
1047  /* position of pulse */
1048  s->aw_idx_is_ext = 0;
1049  if ((bits = get_bits(gb, 6)) >= 54) {
1050  s->aw_idx_is_ext = 1;
1051  bits += (bits - 54) * 3 + get_bits(gb, 2);
1052  }
1053 
1054  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1055  * the distribution of the pulses in each block contained in this frame. */
1056  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1057  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1058  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1059  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1060  offset += s->aw_n_pulses[0] * pitch[0];
1061  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1062  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1063 
1064  /* if continuing from a position before the block, reset position to
1065  * start of block (when corrected for the range over which it can be
1066  * spread in aw_pulse_set1()). */
1067  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1068  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1069  s->aw_first_pulse_off[1] -= pitch[1];
1070  if (start_offset[bits] < 0)
1071  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1072  s->aw_first_pulse_off[0] -= pitch[0];
1073  }
1074 }
1075 
1076 /**
1077  * Apply second set of pitch-adaptive window pulses.
1078  * @param s WMA Voice decoding context private data
1079  * @param gb bit I/O context
1080  * @param block_idx block index in frame [0, 1]
1081  * @param fcb structure containing fixed codebook vector info
1082  * @return -1 on error, 0 otherwise
1083  */
1085  int block_idx, AMRFixed *fcb)
1086 {
1087  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1088  uint16_t *use_mask = use_mask_mem + 2;
1089  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1090  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1091  * of idx are the position of the bit within a particular item in the
1092  * array (0 being the most significant bit, and 15 being the least
1093  * significant bit), and the remainder (>> 4) is the index in the
1094  * use_mask[]-array. This is faster and uses less memory than using a
1095  * 80-byte/80-int array. */
1096  int pulse_off = s->aw_first_pulse_off[block_idx],
1097  pulse_start, n, idx, range, aidx, start_off = 0;
1098 
1099  /* set offset of first pulse to within this block */
1100  if (s->aw_n_pulses[block_idx] > 0)
1101  while (pulse_off + s->aw_pulse_range < 1)
1102  pulse_off += fcb->pitch_lag;
1103 
1104  /* find range per pulse */
1105  if (s->aw_n_pulses[0] > 0) {
1106  if (block_idx == 0) {
1107  range = 32;
1108  } else /* block_idx = 1 */ {
1109  range = 8;
1110  if (s->aw_n_pulses[block_idx] > 0)
1111  pulse_off = s->aw_next_pulse_off_cache;
1112  }
1113  } else
1114  range = 16;
1115  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1116 
1117  /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1118  * in the range [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1119  * we exclude that range from being pulsed again in this function. */
1120  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1121  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1122  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1123  if (s->aw_n_pulses[block_idx] > 0)
1124  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1125  int excl_range = s->aw_pulse_range; // always 16 or 24
1126  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1127  int first_sh = 16 - (idx & 15);
1128  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1129  excl_range -= first_sh;
1130  if (excl_range >= 16) {
1131  *use_mask_ptr++ = 0;
1132  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1133  } else
1134  *use_mask_ptr &= 0xFFFF >> excl_range;
1135  }
1136 
1137  /* find the 'aidx'th offset that is not excluded */
1138  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1139  for (n = 0; n <= aidx; pulse_start++) {
1140  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1141  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1142  if (use_mask[0]) idx = 0x0F;
1143  else if (use_mask[1]) idx = 0x1F;
1144  else if (use_mask[2]) idx = 0x2F;
1145  else if (use_mask[3]) idx = 0x3F;
1146  else if (use_mask[4]) idx = 0x4F;
1147  else return -1;
1148  idx -= av_log2_16bit(use_mask[idx >> 4]);
1149  }
1150  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1151  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1152  n++;
1153  start_off = idx;
1154  }
1155  }
1156 
1157  fcb->x[fcb->n] = start_off;
1158  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1159  fcb->n++;
1160 
1161  /* set offset for next block, relative to start of that block */
1162  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1163  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1164  return 0;
1165 }
1166 
1167 /**
1168  * Apply first set of pitch-adaptive window pulses.
1169  * @param s WMA Voice decoding context private data
1170  * @param gb bit I/O context
1171  * @param block_idx block index in frame [0, 1]
1172  * @param fcb storage location for fixed codebook pulse info
1173  */
1175  int block_idx, AMRFixed *fcb)
1176 {
1177  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1178  float v;
1179 
1180  if (s->aw_n_pulses[block_idx] > 0) {
1181  int n, v_mask, i_mask, sh, n_pulses;
1182 
1183  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1184  n_pulses = 3;
1185  v_mask = 8;
1186  i_mask = 7;
1187  sh = 4;
1188  } else { // 4 pulses, 1:sign + 2:index each
1189  n_pulses = 4;
1190  v_mask = 4;
1191  i_mask = 3;
1192  sh = 3;
1193  }
1194 
1195  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1196  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1197  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1198  s->aw_first_pulse_off[block_idx];
1199  while (fcb->x[fcb->n] < 0)
1200  fcb->x[fcb->n] += fcb->pitch_lag;
1201  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1202  fcb->n++;
1203  }
1204  } else {
1205  int num2 = (val & 0x1FF) >> 1, delta, idx;
1206 
1207  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1208  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1209  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1210  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1211  v = (val & 0x200) ? -1.0 : 1.0;
1212 
1213  fcb->no_repeat_mask |= 3 << fcb->n;
1214  fcb->x[fcb->n] = idx - delta;
1215  fcb->y[fcb->n] = v;
1216  fcb->x[fcb->n + 1] = idx;
1217  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1218  fcb->n += 2;
1219  }
1220 }
1221 
/**
 * @}
 *
 * Derive a deterministic pseudo-random number from frame_cntr and
 * block_num. The result is reduced modulo (1000 - block_size), so it lies
 * in the range [0, 1000 - block_size) and can be used as the start index
 * for reading block_size entries from a table that has 1000 entries.
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number in the [0, 1000 - block_size) range.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* Lookup table that removes the division from:
     *   y = (x % 9) * 5 + 6;
     *   z = (49995 * x) / y;
     * y can only take 9 different values, so z is rewritten as
     *   z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row belongs to one value of y: the first entry is 49995 / y,
     * the second is the FASTDIV-style multiplier for (49995 % y) / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 * 93368855U  }  // y = 46
    };
    const unsigned int *row;
    unsigned int seed, rem9, quot;

    seed = MUL16(block_num, 1877) + frame_cntr;
    /* a single subtraction acts as a modulo here; the original author notes
     * the maximum value of the sum as 8*1877+0xFFFE=0x13AA6 */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    rem9 = seed - 9 * MULH(477218589, seed);    // seed % 9
    row  = div_tbl[rem9];

    /* seed * 49995 / (rem9 * 5 + 6), truncated to 16 bits */
    quot = (uint16_t) (seed * row[0] + UMULH(seed, row[1]));

    return quot % (1000 - block_size);
}
1266 
1267 /**
1268  * Parse hardcoded signal for a single block.
1269  * @note see #synth_block().
1270  */
1272  int block_idx, int size,
1273  const struct frame_type_desc *frame_desc,
1274  float *excitation)
1275 {
1276  float gain;
1277  int n, r_idx;
1278 
1279  av_assert0(size <= MAX_FRAMESIZE);
1280 
1281  /* Set the offset from which we start reading wmavoice_std_codebook */
1282  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1283  r_idx = pRNG(s->frame_cntr, block_idx, size);
1284  gain = s->silence_gain;
1285  } else /* FCB_TYPE_HARDCODED */ {
1286  r_idx = get_bits(gb, 8);
1287  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1288  }
1289 
1290  /* Clear gain prediction parameters */
1291  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1292 
1293  /* Apply gain to hardcoded codebook and use that as excitation signal */
1294  for (n = 0; n < size; n++)
1295  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1296 }
1297 
1298 /**
1299  * Parse FCB/ACB signal for a single block.
1300  * @note see #synth_block().
1301  */
1303  int block_idx, int size,
1304  int block_pitch_sh2,
1305  const struct frame_type_desc *frame_desc,
1306  float *excitation)
1307 {
1308  static const float gain_coeff[6] = {
1309  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1310  };
1311  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1312  int n, idx, gain_weight;
1313  AMRFixed fcb;
1314 
1315  av_assert0(size <= MAX_FRAMESIZE / 2);
1316  memset(pulses, 0, sizeof(*pulses) * size);
1317 
1318  fcb.pitch_lag = block_pitch_sh2 >> 2;
1319  fcb.pitch_fac = 1.0;
1320  fcb.no_repeat_mask = 0;
1321  fcb.n = 0;
1322 
1323  /* For the other frame types, this is where we apply the innovation
1324  * (fixed) codebook pulses of the speech signal. */
1325  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1326  aw_pulse_set1(s, gb, block_idx, &fcb);
1327  if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1328  /* Conceal the block with silence and return.
1329  * Skip the correct amount of bits to read the next
1330  * block from the correct offset. */
1331  int r_idx = pRNG(s->frame_cntr, block_idx, size);
1332 
1333  for (n = 0; n < size; n++)
1334  excitation[n] =
1335  wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1336  skip_bits(gb, 7 + 1);
1337  return;
1338  }
1339  } else /* FCB_TYPE_EXC_PULSES */ {
1340  int offset_nbits = 5 - frame_desc->log_n_blocks;
1341 
1342  fcb.no_repeat_mask = -1;
1343  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1344  * (instead of double) for a subset of pulses */
1345  for (n = 0; n < 5; n++) {
1346  float sign;
1347  int pos1, pos2;
1348 
1349  sign = get_bits1(gb) ? 1.0 : -1.0;
1350  pos1 = get_bits(gb, offset_nbits);
1351  fcb.x[fcb.n] = n + 5 * pos1;
1352  fcb.y[fcb.n++] = sign;
1353  if (n < frame_desc->dbl_pulses) {
1354  pos2 = get_bits(gb, offset_nbits);
1355  fcb.x[fcb.n] = n + 5 * pos2;
1356  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1357  }
1358  }
1359  }
1360  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1361 
1362  /* Calculate gain for adaptive & fixed codebook signal.
1363  * see ff_amr_set_fixed_gain(). */
1364  idx = get_bits(gb, 7);
1366  gain_coeff, 6) -
1367  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1368  acb_gain = wmavoice_gain_codebook_acb[idx];
1369  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1370  -2.9957322736 /* log(0.05) */,
1371  1.6094379124 /* log(5.0) */);
1372 
1373  gain_weight = 8 >> frame_desc->log_n_blocks;
1374  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1375  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1376  for (n = 0; n < gain_weight; n++)
1377  s->gain_pred_err[n] = pred_err;
1378 
1379  /* Calculation of adaptive codebook */
1380  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1381  int len;
1382  for (n = 0; n < size; n += len) {
1383  int next_idx_sh16;
1384  int abs_idx = block_idx * size + n;
1385  int pitch_sh16 = (s->last_pitch_val << 16) +
1386  s->pitch_diff_sh16 * abs_idx;
1387  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1388  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1389  idx = idx_sh16 >> 16;
1390  if (s->pitch_diff_sh16) {
1391  if (s->pitch_diff_sh16 > 0) {
1392  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1393  } else
1394  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1395  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1396  1, size - n);
1397  } else
1398  len = size;
1399 
1400  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1402  idx, 9, len);
1403  }
1404  } else /* ACB_TYPE_HAMMING */ {
1405  int block_pitch = block_pitch_sh2 >> 2;
1406  idx = block_pitch_sh2 & 3;
1407  if (idx) {
1408  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1410  idx, 8, size);
1411  } else
1412  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1413  sizeof(float) * size);
1414  }
1415 
1416  /* Interpolate ACB/FCB and use as excitation signal */
1417  ff_weighted_vector_sumf(excitation, excitation, pulses,
1418  acb_gain, fcb_gain, size);
1419 }
1420 
1421 /**
1422  * Parse data in a single block.
1423  *
1424  * @param s WMA Voice decoding context private data
1425  * @param gb bit I/O context
1426  * @param block_idx index of the to-be-read block
1427  * @param size amount of samples to be read in this block
1428  * @param block_pitch_sh2 pitch for this block << 2
1429  * @param lsps LSPs for (the end of) this frame
1430  * @param prev_lsps LSPs for the last frame
1431  * @param frame_desc frame type descriptor
1432  * @param excitation target memory for the ACB+FCB interpolated signal
1433  * @param synth target memory for the speech synthesis filter output
1434  * @return 0 on success, <0 on error.
1435  */
1437  int block_idx, int size,
1438  int block_pitch_sh2,
1439  const double *lsps, const double *prev_lsps,
1440  const struct frame_type_desc *frame_desc,
1441  float *excitation, float *synth)
1442 {
1443  double i_lsps[MAX_LSPS];
1444  float lpcs[MAX_LSPS];
1445  float fac;
1446  int n;
1447 
1448  if (frame_desc->acb_type == ACB_TYPE_NONE)
1449  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1450  else
1451  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1452  frame_desc, excitation);
1453 
1454  /* convert interpolated LSPs to LPCs */
1455  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1456  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1457  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1458  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1459 
1460  /* Speech synthesis */
1461  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1462 }
1463 
1464 /**
1465  * Synthesize output samples for a single frame.
1466  *
1467  * @param ctx WMA Voice decoder context
1468  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1469  * @param frame_idx Frame number within superframe [0-2]
1470  * @param samples pointer to output sample buffer, has space for at least 160
1471  * samples
1472  * @param lsps LSP array
1473  * @param prev_lsps array of previous frame's LSPs
1474  * @param excitation target buffer for excitation signal
1475  * @param synth target buffer for synthesized speech data
1476  * @return 0 on success, <0 on error.
1477  */
1478 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1479  float *samples,
1480  const double *lsps, const double *prev_lsps,
1481  float *excitation, float *synth)
1482 {
1483  WMAVoiceContext *s = ctx->priv_data;
1484  int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1485  int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1486 
1487  /* Parse frame type ("frame header"), see frame_descs */
1488  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1489 
1490  if (bd_idx < 0) {
1491  av_log(ctx, AV_LOG_ERROR,
1492  "Invalid frame type VLC code, skipping\n");
1493  return AVERROR_INVALIDDATA;
1494  }
1495 
1496  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1497 
1498  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1499  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1500  /* Pitch is provided per frame, which is interpreted as the pitch of
1501  * the last sample of the last block of this frame. We can interpolate
1502  * the pitch of other blocks (and even pitch-per-sample) by gradually
1503  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1504  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1505  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1506  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1507  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1508  if (s->last_acb_type == ACB_TYPE_NONE ||
1509  20 * abs(cur_pitch_val - s->last_pitch_val) >
1510  (cur_pitch_val + s->last_pitch_val))
1511  s->last_pitch_val = cur_pitch_val;
1512 
1513  /* pitch per block */
1514  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1515  int fac = n * 2 + 1;
1516 
1517  pitch[n] = (MUL16(fac, cur_pitch_val) +
1518  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1519  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1520  }
1521 
1522  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1523  s->pitch_diff_sh16 =
1524  (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1525  }
1526 
1527  /* Global gain (if silence) and pitch-adaptive window coordinates */
1528  switch (frame_descs[bd_idx].fcb_type) {
1529  case FCB_TYPE_SILENCE:
1531  break;
1532  case FCB_TYPE_AW_PULSES:
1533  aw_parse_coords(s, gb, pitch);
1534  break;
1535  }
1536 
1537  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1538  int bl_pitch_sh2;
1539 
1540  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1541  switch (frame_descs[bd_idx].acb_type) {
1542  case ACB_TYPE_HAMMING: {
1543  /* Pitch is given per block. Per-block pitches are encoded as an
1544  * absolute value for the first block, and then delta values
1545  * (relative to this value) for all subsequent blocks. The scale of
1546  * this pitch value is semi-logarithmic compared to its use in the
1547  * decoder, so we convert it to normal scale also. */
1548  int block_pitch,
1549  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1550  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1551  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1552 
1553  if (n == 0) {
1554  block_pitch = get_bits(gb, s->block_pitch_nbits);
1555  } else
1556  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1558  /* Convert last_ so that any next delta is within _range */
1559  last_block_pitch = av_clip(block_pitch,
1561  s->block_pitch_range -
1563 
1564  /* Convert semi-log-style scale back to normal scale */
1565  if (block_pitch < t1) {
1566  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1567  } else {
1568  block_pitch -= t1;
1569  if (block_pitch < t2) {
1570  bl_pitch_sh2 =
1571  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1572  } else {
1573  block_pitch -= t2;
1574  if (block_pitch < t3) {
1575  bl_pitch_sh2 =
1576  (s->block_conv_table[2] + block_pitch) << 2;
1577  } else
1578  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1579  }
1580  }
1581  pitch[n] = bl_pitch_sh2 >> 2;
1582  break;
1583  }
1584 
1585  case ACB_TYPE_ASYMMETRIC: {
1586  bl_pitch_sh2 = pitch[n] << 2;
1587  break;
1588  }
1589 
1590  default: // ACB_TYPE_NONE has no pitch
1591  bl_pitch_sh2 = 0;
1592  break;
1593  }
1594 
1595  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1596  lsps, prev_lsps, &frame_descs[bd_idx],
1597  &excitation[n * block_nsamples],
1598  &synth[n * block_nsamples]);
1599  }
1600 
1601  /* Averaging projection filter, if applicable. Else, just copy samples
1602  * from synthesis buffer */
1603  if (s->do_apf) {
1604  double i_lsps[MAX_LSPS];
1605  float lpcs[MAX_LSPS];
1606 
1607  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1608  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1609  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1610  postfilter(s, synth, samples, 80, lpcs,
1611  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1612  frame_descs[bd_idx].fcb_type, pitch[0]);
1613 
1614  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1615  i_lsps[n] = cos(lsps[n]);
1616  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1617  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1618  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1619  frame_descs[bd_idx].fcb_type, pitch[0]);
1620  } else
1621  memcpy(samples, synth, 160 * sizeof(synth[0]));
1622 
1623  /* Cache values for next frame */
1624  s->frame_cntr++;
1625  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1626  s->last_acb_type = frame_descs[bd_idx].acb_type;
1627  switch (frame_descs[bd_idx].acb_type) {
1628  case ACB_TYPE_NONE:
1629  s->last_pitch_val = 0;
1630  break;
1631  case ACB_TYPE_ASYMMETRIC:
1632  s->last_pitch_val = cur_pitch_val;
1633  break;
1634  case ACB_TYPE_HAMMING:
1635  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1636  break;
1637  }
1638 
1639  return 0;
1640 }
1641 
/**
 * Ensure minimum value for first item, maximum value for last value,
 * proper spacing between each value and proper ordering.
 *
 * @param lsps array of LSPs, modified in place
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    const double first_min = 0.0015 * M_PI;
    const double min_gap   = 0.0125 * M_PI;
    const double last_max  = 0.9985 * M_PI;
    int i, j, in_order = 1;

    /* Raise the first value to its minimum, then push every following
     * value up until it is at least min_gap above its predecessor. */
    if (!(lsps[0] > first_min))
        lsps[0] = first_min;
    for (i = 1; i < num; i++) {
        const double floor_val = lsps[i - 1] + min_gap;

        if (!(lsps[i] > floor_val))
            lsps[i] = floor_val;
    }

    /* Cap the last value; this may drop it below its predecessors. */
    if (lsps[num - 1] > last_max)
        lsps[num - 1] = last_max;

    /* Nothing more to do unless some neighbouring pair is out of order. */
    for (i = 1; i < num; i++) {
        if (lsps[i] < lsps[i - 1]) {
            in_order = 0;
            break;
        }
    }
    if (in_order)
        return;

    /* A single insertion-sort pass over the whole array restores order. */
    for (i = 1; i < num; i++) {
        const double key = lsps[i];

        j = i;
        while (j > 0 && !(lsps[j - 1] <= key)) {
            lsps[j] = lsps[j - 1];
            j--;
        }
        lsps[j] = key;
    }
}
1682 
1683 /**
1684  * Synthesize output samples for a single superframe. If we have any data
1685  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1686  * in s->gb.
1687  *
1688  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1689  * to give a total of 480 samples per superframe. See #synth_frame() for frame
1690  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1691  * (if these are globally specified for all frames (residually); they can
1692  * also be specified individually per-frame. See the s->has_residual_lsps
1693  * option), and can specify the number of samples encoded in this superframe
1694  * (if less than 480), usually used to prevent blanks at track boundaries.
1695  *
1696  * @param ctx WMA Voice decoder context
1697  * @return 0 on success, <0 on error or 1 if there was not enough data to
1698  * fully parse the superframe
1699  */
1701  int *got_frame_ptr)
1702 {
1703  WMAVoiceContext *s = ctx->priv_data;
1704  GetBitContext *gb = &s->gb, s_gb;
1705  int n, res, n_samples = MAX_SFRAMESIZE;
1706  double lsps[MAX_FRAMES][MAX_LSPS];
1707  const double *mean_lsf = s->lsps == 16 ?
1709  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1710  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1711  float *samples;
1712 
1713  memcpy(synth, s->synth_history,
1714  s->lsps * sizeof(*synth));
1715  memcpy(excitation, s->excitation_history,
1716  s->history_nsamples * sizeof(*excitation));
1717 
1718  if (s->sframe_cache_size > 0) {
1719  gb = &s_gb;
1721  s->sframe_cache_size = 0;
1722  }
1723 
1724  /* First bit is speech/music bit, it differentiates between WMAVoice
1725  * speech samples (the actual codec) and WMAVoice music samples, which
1726  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1727  * the wild yet. */
1728  if (!get_bits1(gb)) {
1729  avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1730  return AVERROR_PATCHWELCOME;
1731  }
1732 
1733  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1734  if (get_bits1(gb)) {
1735  if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1736  av_log(ctx, AV_LOG_ERROR,
1737  "Superframe encodes > %d samples (%d), not allowed\n",
1738  MAX_SFRAMESIZE, n_samples);
1739  return AVERROR_INVALIDDATA;
1740  }
1741  }
1742 
1743  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1744  if (s->has_residual_lsps) {
1745  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1746 
1747  for (n = 0; n < s->lsps; n++)
1748  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1749 
1750  if (s->lsps == 10) {
1751  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1752  } else /* s->lsps == 16 */
1753  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1754 
1755  for (n = 0; n < s->lsps; n++) {
1756  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1757  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1758  lsps[2][n] += mean_lsf[n];
1759  }
1760  for (n = 0; n < 3; n++)
1761  stabilize_lsps(lsps[n], s->lsps);
1762  }
1763 
1764  /* get output buffer */
1765  frame->nb_samples = MAX_SFRAMESIZE;
1766  if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1767  return res;
1768  frame->nb_samples = n_samples;
1769  samples = (float *)frame->data[0];
1770 
1771  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1772  for (n = 0; n < 3; n++) {
1773  if (!s->has_residual_lsps) {
1774  int m;
1775 
1776  if (s->lsps == 10) {
1777  dequant_lsp10i(gb, lsps[n]);
1778  } else /* s->lsps == 16 */
1779  dequant_lsp16i(gb, lsps[n]);
1780 
1781  for (m = 0; m < s->lsps; m++)
1782  lsps[n][m] += mean_lsf[m];
1783  stabilize_lsps(lsps[n], s->lsps);
1784  }
1785 
1786  if ((res = synth_frame(ctx, gb, n,
1787  &samples[n * MAX_FRAMESIZE],
1788  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1789  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1790  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1791  *got_frame_ptr = 0;
1792  return res;
1793  }
1794  }
1795 
1796  /* Statistics? FIXME - we don't check for length, a slight overrun
1797  * will be caught by internal buffer padding, and anything else
1798  * will be skipped, not read. */
1799  if (get_bits1(gb)) {
1800  res = get_bits(gb, 4);
1801  skip_bits(gb, 10 * (res + 1));
1802  }
1803 
1804  if (get_bits_left(gb) < 0) {
1805  wmavoice_flush(ctx);
1806  return AVERROR_INVALIDDATA;
1807  }
1808 
1809  *got_frame_ptr = 1;
1810 
1811  /* Update history */
1812  memcpy(s->prev_lsps, lsps[2],
1813  s->lsps * sizeof(*s->prev_lsps));
1814  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1815  s->lsps * sizeof(*synth));
1816  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1817  s->history_nsamples * sizeof(*excitation));
1818  if (s->do_apf)
1819  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1820  s->history_nsamples * sizeof(*s->zero_exc_pf));
1821 
1822  return 0;
1823 }
1824 
1825 /**
1826  * Parse the packet header at the start of each packet (input data to this
1827  * decoder).
1828  *
1829  * @param s WMA Voice decoding context private data
1830  * @return <0 on error, nb_superframes on success.
1831  */
/* NOTE(review): listing line 1832 (the function signature) is absent from
 * this extract; the cross-reference index further down records it as
 * "static int parse_packet_header(WMAVoiceContext *s)". */
1833 {
1834  GetBitContext *gb = &s->gb;
1835  unsigned int res, n_superframes = 0;
1836 
1837  skip_bits(gb, 4); // packet sequence number
1838  s->has_residual_lsps = get_bits1(gb);
/* The superframe count is the sum of one or more 6-bit fields: a field equal
 * to 0x3F (all ones) means that another field follows. Before each read,
 * make sure the buffer still holds this field plus s->spillover_bitsize
 * further bits; otherwise the header is rejected. */
1839  do {
1840  if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1841  return AVERROR_INVALIDDATA;
1842 
1843  res = get_bits(gb, 6); // number of superframes per packet
1844  // (minus first one if there is spillover)
1845  n_superframes += res;
1846  } while (res == 0x3F);
/* NOTE(review): listing line 1847 is absent from this extract. Given the
 * space reserved by the check inside the loop, it presumably reads
 * s->spillover_bitsize bits (into s->spillover_nbits?) - confirm against
 * the upstream file. */
1848 
/* A negative number of bits left means the reads above ran past the end of
 * the buffer, in which case the header is reported as invalid. */
1849  return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1850 }
1851 
1852 /**
1853  * Copy (unaligned) bits from gb/data/size to pb.
1854  *
1855  * @param pb target buffer to copy bits into
1856  * @param data source buffer to copy bits from
1857  * @param size size of the source data, in bytes
1858  * @param gb bit I/O context specifying the current position in the source.
1859  * data. This function might use this to align the bit position to
1860  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1861  * source data
1862  * @param nbits the amount of bits to copy from source to target
1863  *
1864  * @note after calling this function, the current position in the input bit
1865  * I/O context is undefined.
1866  */
1867 static void copy_bits(PutBitContext *pb,
1868  const uint8_t *data, int size,
1869  GetBitContext *gb, int nbits)
1870 {
1871  int rmn_bytes, rmn_bits;
1872 
1873  rmn_bits = rmn_bytes = get_bits_left(gb);
1874  if (rmn_bits < nbits)
1875  return;
1876  if (nbits > pb->size_in_bits - put_bits_count(pb))
1877  return;
1878  rmn_bits &= 7; rmn_bytes >>= 3;
1879  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1880  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1881  avpriv_copy_bits(pb, data + size - rmn_bytes,
1882  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1883 }
1884 
1885 /**
1886  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1887  * and we expect that the demuxer / application provides it to us as such
1888  * (else you'll probably get garbage as output). Every packet has a size of
1889  * ctx->block_align bytes, starts with a packet header (see
1890  * #parse_packet_header()), and then a series of superframes. Superframe
1891  * boundaries may exceed packets, i.e. superframes can split data over
1892  * multiple (two) packets.
1893  *
1894  * For more information about frames, see #synth_superframe().
1895  */
/* NOTE(review): listing line 1896 (the start of the function signature) is
 * absent from this extract; the cross-reference index further down records
 * the full prototype as "static int wmavoice_decode_packet(AVCodecContext
 * *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt)".
 *
 * Return value as visible in the body: a negative error code, the number of
 * bytes consumed up to the end of a decoded superframe, or the capped packet
 * size when no frame is output. */
1897  int *got_frame_ptr, AVPacket *avpkt)
1898 {
1899  WMAVoiceContext *s = ctx->priv_data;
1900  GetBitContext *gb = &s->gb;
1901  int size, res, pos;
1902 
1903  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1904  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1905  * feeds us ASF packets, which may concatenate multiple "codec" packets
1906  * in a single "muxer" packet, so we artificially emulate that by
1907  * capping the packet size at ctx->block_align. */
/* The empty-bodied loop leaves size in 1..ctx->block_align for a non-empty
 * packet and 0 for an empty one. NOTE(review): this needs
 * ctx->block_align > 0, both for the loop to terminate and for the modulo
 * below - confirm that the init code guarantees it. */
1908  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1909  init_get_bits(&s->gb, avpkt->data, size << 3);
1910 
1911  /* size == ctx->block_align is used to indicate whether we are dealing with
1912  * a new packet or a packet of which we already read the packet header
1913  * previously. */
/* The test below is also true for size == 0 (an empty packet). */
1914  if (!(size % ctx->block_align)) { // new packet header
/* Empty packet: there is no header to parse, so the spillover length and
 * the superframe count are simply cleared. */
1915  if (!size) {
1916  s->spillover_nbits = 0;
1917  s->nb_superframes = 0;
1918  } else {
1919  if ((res = parse_packet_header(s)) < 0)
1920  return res;
1921  s->nb_superframes = res;
1922  }
1923 
1924  /* If the packet header specifies a s->spillover_nbits, then we want
1925  * to push out all data of the previous packet (+ spillover) before
1926  * continuing to parse new superframes in the current packet. */
1927  if (s->sframe_cache_size > 0) {
/* cnt is the bit position in this packet right after the header. */
1928  int cnt = get_bits_count(gb);
/* Clamp the spillover length to the bits available after the header.
 * NOTE(review): the bound uses avpkt->size, while the reader was set up
 * with the capped size - confirm this is intended. */
1929  if (cnt + s->spillover_nbits > avpkt->size * 8) {
1930  s->spillover_nbits = avpkt->size * 8 - cnt;
1931  }
1932  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1933  flush_put_bits(&s->pb);
/* NOTE(review): listing line 1934 is absent from this extract; presumably
 * it updates s->sframe_cache_size to account for the bits just appended -
 * confirm against the upstream file. */
/* If a frame was produced, report the bytes of this packet consumed up to
 * the end of the spillover and keep the sub-byte remainder so that the
 * next call can skip it. */
1935  if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1936  *got_frame_ptr) {
1937  cnt += s->spillover_nbits;
1938  s->skip_bits_next = cnt & 7;
1939  res = cnt >> 3;
1940  return res;
1941  } else
/* NOTE(review): the amount skipped is s->spillover_nbits plus the bits
 * already consumed from gb since cnt was taken (get_bits_count(gb) - cnt),
 * not minus them - confirm the intended resync target. */
1942  skip_bits_long (gb, s->spillover_nbits - cnt +
1943  get_bits_count(gb)); // resync
1944  } else if (s->spillover_nbits) {
1945  skip_bits_long(gb, s->spillover_nbits); // resync
1946  }
/* Not at a packet start: skip the bits recorded in s->skip_bits_next by an
 * earlier call. */
1947  } else if (s->skip_bits_next)
1948  skip_bits(gb, s->skip_bits_next);
1949 
1950  /* Try parsing superframes in current packet */
1951  s->sframe_cache_size = 0;
1952  s->skip_bits_next = 0;
1953  pos = get_bits_left(gb);
/* The counter is compared before it is decremented: 0 means nothing is left
 * to decode in this packet; a value that is still > 0 after the decrement
 * selects the decode branch; otherwise (the counter was 1) the remaining
 * pos bits are cached as the start of a superframe that continues in the
 * next packet. */
1954  if (s->nb_superframes-- == 0) {
1955  *got_frame_ptr = 0;
1956  return size;
1957  } else if (s->nb_superframes > 0) {
1958  if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1959  return res;
1960  } else if (*got_frame_ptr) {
/* Report whole bytes consumed; the leftover bit offset is skipped on the
 * next call. */
1961  int cnt = get_bits_count(gb);
1962  s->skip_bits_next = cnt & 7;
1963  res = cnt >> 3;
1964  return res;
1965  }
1966  } else if ((s->sframe_cache_size = pos) > 0) {
1967  /* ... cache it for spillover in next packet */
/* NOTE(review): listing line 1968 is absent from this extract; presumably
 * it (re)initializes s->pb over the cache buffer before the copy below -
 * confirm against the upstream file. */
1969  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1970  // FIXME bad - just copy bytes as whole and use the
1971  // skip_bits_next field
1972  }
1973 
/* Reached when no frame was returned above: the whole (capped) packet is
 * reported as consumed. */
1974  return size;
1975 }
1976 
/* Decoder close callback: releases the transform contexts held in the
 * private context.
 *
 * NOTE(review): listing line 1977 (the function signature) is absent from
 * this extract; the cross-reference index further down records it as
 * "static av_cold int wmavoice_decode_end(AVCodecContext *ctx)". */
1978 {
1979  WMAVoiceContext *s = ctx->priv_data;
1980 
/* The two RDFT and two DCT contexts are only torn down when s->do_apf is
 * set; presumably they are only set up in that case - confirm against the
 * init function. */
1981  if (s->do_apf) {
1982  ff_rdft_end(&s->rdft);
1983  ff_rdft_end(&s->irdft);
1984  ff_dct_end(&s->dct);
1985  ff_dct_end(&s->dst);
1986  }
1987 
/* Always reports success. */
1988  return 0;
1989 }
1990 
/* Codec registration entry for the WMA Voice decoder.
 *
 * NOTE(review): this initializer is incomplete in this extract. Listing
 * line 1991 (the declaration, recorded in the cross-reference index further
 * down as "AVCodec ff_wmavoice_decoder") and listing lines 1997, 2000 and
 * 2001 are absent; presumably they set the init and decode callbacks and
 * the capability flags - confirm against the upstream file. */
1992  .name = "wmavoice",
1993  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1994  .type = AVMEDIA_TYPE_AUDIO,
1995  .id = AV_CODEC_ID_WMAVOICE,
1996  .priv_data_size = sizeof(WMAVoiceContext),
1998  .init_static_data = wmavoice_init_static_data,
1999  .close = wmavoice_decode_end,
2002  .flush = wmavoice_flush,
2003 };
RDFTContext rdft
Definition: wmavoice.c:264
Description of frame types.
Definition: wmavoice.c:98
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply first set of pitch-adaptive window pulses.
Definition: wmavoice.c:1174
av_cold void ff_rdft_end(RDFTContext *s)
Definition: rdft.c:114
static const uint8_t wmavoice_dq_lsp16r2[0x500]
#define NULL
Definition: coverity.c:32
const char const char void * val
Definition: avisynth_c.h:771
int do_apf
whether to apply the averaged projection filter (APF)
Definition: wmavoice.c:148
const char * s
Definition: avisynth_c.h:768
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:59
static int pRNG(int frame_cntr, int block_num, int block_size)
Generate a random number from frame_cntr and block_idx, which will live in the range [0...
Definition: wmavoice.c:1235
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
Set up the variable bit mode (VBM) tree from container extradata.
Definition: wmavoice.c:299
void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP synthesis filter.
Definition: celp_filters.c:84
float gain_pred_err[6]
cache for gain prediction
Definition: wmavoice.c:249
int size
static float alpha(float a)
This structure describes decoded (raw) audio or video data.
Definition: frame.h:201
int aw_next_pulse_off_cache
the position (relative to start of the second block) at which pulses should start to be positioned...
Definition: wmavoice.c:240
int nb_superframes
number of superframes in current packet
Definition: wmavoice.c:248
static void flush(AVCodecContext *avctx)
float postfilter_agc
gain control memory, used in adaptive_gain_control()
Definition: wmavoice.c:270
void ff_acelp_apply_order_2_transfer_function(float *out, const float *in, const float zero_coeffs[2], const float pole_coeffs[2], float gain, float mem[2], int n)
Apply an order 2 rational transfer function in-place.
static void put_bits(Jpeg2000EncoderContext *s, int val, int n)
put n times val bit
Definition: j2kenc.c:206
static unsigned int get_bits(GetBitContext *s, int n)
Read 1-25 bits.
Definition: get_bits.h:262
static void postfilter(WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch)
Averaging projection filter, the postfilter used in WMAVoice.
Definition: wmavoice.c:802
Memory handling functions.
void ff_weighted_vector_sumf(float *out, const float *in_a, const float *in_b, float weight_coeff_a, float weight_coeff_b, int length)
float implementation of weighted sum of two vectors.
static void skip_bits_long(GetBitContext *s, int n)
Definition: get_bits.h:205
static av_cold int init(AVCodecContext *avctx)
Definition: avrndec.c:35
#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)
Definition: vlc.h:75
#define avpriv_request_sample(...)
float synth_filter_out_buf[0x80+MAX_LSPS_ALIGN16]
aligned buffer for postfilter speech synthesis
Definition: wmavoice.c:282
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, const int *pitch)
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between th...
Definition: wmavoice.c:1032
static const int8_t pulses[4]
Number of non-zero pulses in the MP-MLQ excitation.
Definition: g723_1.h:720
int x[10]
Definition: acelp_vectors.h:55
int size
Definition: avcodec.h:1680
int aw_n_pulses[2]
number of AW-pulses in each block; note that this number can be negative (in which case it basically ...
Definition: wmavoice.c:235
static int interpol(MBContext *s, uint32_t *color, int x, int y, int linesize)
void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
Copy the content of src to the bitstream.
Definition: bitstream.c:64
static void stabilize_lsps(double *lsps, int num)
Ensure minimum value for first item, maximum value for last value, proper spacing between each value ...
Definition: wmavoice.c:1654
static const float wmavoice_gain_codebook_fcb[128]
static const uint8_t wmavoice_dq_lsp16i1[0x640]
#define a1
Definition: regdef.h:47
static const uint8_t wmavoice_dq_lsp16r1[0x500]
int spillover_nbits
number of bits of the previous packet's last superframe preceding this packet's first full superframe...
Definition: wmavoice.c:187
void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
Add fixed vector to an array from a sparse representation.
hardcoded (fixed) codebook with per-block gain values
Definition: wmavoice.c:86
int block_pitch_nbits
number of bits used to specify the first block's pitch value
Definition: wmavoice.c:166
static const uint8_t wmavoice_dq_lsp16i3[0x300]
float pitch_fac
Definition: acelp_vectors.h:59
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth)
Synthesize output samples for a single frame.
Definition: wmavoice.c:1478
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch.
Definition: wmavoice.c:69
static void calc_input_response(WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder)
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition: wmavoice.c:601
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
Parse 10 independently-coded LSPs.
Definition: wmavoice.c:887
AVCodec.
Definition: avcodec.h:3739
#define MAX_LSPS_ALIGN16
same as MAX_LSPS; needs to be multiple
Definition: wmavoice.c:48
int block_align
number of bytes per packet if constant and known or 0 Used by some WAV based audio codecs...
Definition: avcodec.h:2560
static void decode(AVCodecContext *dec_ctx, AVPacket *pkt, AVFrame *frame, FILE *outfile)
Definition: decode_audio.c:42
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply second set of pitch-adaptive window pulses.
Definition: wmavoice.c:1084
Innovation (fixed) codebook pulse sets in combinations of either single pulses or pulse pairs...
Definition: wmavoice.c:90
static const float wmavoice_ipol1_coeffs[17 *9]
static const uint8_t wmavoice_dq_lsp16i2[0x3c0]
comfort noise during silence generated from a hardcoded (fixed) codebook with per-frame (low) gain va...
Definition: wmavoice.c:83
#define AV_CODEC_CAP_DELAY
Encoder or decoder requires flushing with NULL input at the end in order to give the complete and cor...
Definition: avcodec.h:1027
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:37
int spillover_bitsize
number of bits used to specify spillover_nbits in the packet header = ceil(log2(ctx->block_align << 3...
Definition: wmavoice.c:141
int block_delta_pitch_nbits
number of bits used to specify the delta pitch between this and the last block's pitch value...
Definition: wmavoice.c:169
uint8_t bits
Definition: crc.c:296
enum AVSampleFormat sample_fmt
audio sample format
Definition: avcodec.h:2531
int mem
Definition: avisynth_c.h:821
uint8_t
#define av_cold
Definition: attributes.h:82
Sparse representation for the algebraic codebook (fixed) vector.
Definition: acelp_vectors.h:53
static const uint8_t wmavoice_dq_lsp16r3[0x600]
float delta
DCTContext dct
Definition: wmavoice.c:266
static const float wmavoice_gain_codebook_acb[128]
uint8_t log_n_blocks
log2(n_blocks)
Definition: wmavoice.c:101
int aw_first_pulse_off[2]
index of first sample to which to apply AW-pulses, or -0xff if unset
Definition: wmavoice.c:238
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:90
int has_residual_lsps
if set, superframes contain one set of LSPs that cover all frames, encoded as independent and residua...
Definition: wmavoice.c:191
float tilted_lpcs_pf[0x80]
aligned buffer for LPC tilting
Definition: wmavoice.c:278
uint8_t * extradata
some codecs need / can use extradata like Huffman tables.
Definition: avcodec.h:1876
static float tilt_factor(const float *lpcs, int n_lpcs)
Get the tilt factor of a formant filter from its transfer function.
Definition: wmavoice.c:588
static const uint8_t wmavoice_dq_lsp10r[0x1400]
static AVFrame * frame
const char data[16]
Definition: mxf.c:90
static void dequant_lsps(double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q)
Dequantize LSPs.
Definition: wmavoice.c:856
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:104
static const float wmavoice_ipol2_coeffs[32]
Hamming-window sinc function (num = 32, x = [ 0, 31 ]): (0.54 + 0.46 * cos(2 * M_PI * x / (num - 1)))...
uint8_t * data
Definition: avcodec.h:1679
static int get_bits_count(const GetBitContext *s)
Definition: get_bits.h:200
static int flags
Definition: log.c:57
float dcf_mem[2]
DC filter history.
Definition: wmavoice.c:272
void av_memcpy_backptr(uint8_t *dst, int back, int cnt)
Overlapping memcpy() implementation.
Definition: mem.c:412
bitstream reader API header.
static av_cold void wmavoice_flush(AVCodecContext *ctx)
Definition: wmavoice.c:335
float synth_history[MAX_LSPS]
see excitation_history
Definition: wmavoice.c:254
double prev_lsps[MAX_LSPS]
LSPs of the last frame of the previous superframe.
Definition: wmavoice.c:219
static void copy_bits(PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits)
Copy (unaligned) bits from gb/data/size to pb.
Definition: wmavoice.c:1867
#define av_log(a,...)
#define expf(x)
Definition: libm.h:283
#define U(x)
Definition: vp56_arith.h:37
static int get_bits_left(GetBitContext *gb)
Definition: get_bits.h:589
int size_in_bits
Definition: put_bits.h:39
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
static const double wmavoice_mean_lsf16[2][16]
int sframe_cache_size
set to >0 if we have data from an (incomplete) superframe from a previous packet that spilled over in...
Definition: wmavoice.c:203
static const float wmavoice_lsp10_intercoeff_b[32][2][10]
int block_pitch_range
range of the block pitch
Definition: wmavoice.c:168
static const float wmavoice_std_codebook[1000]
static const int sizes[][2]
Definition: img2dec.c:51
int last_acb_type
frame type [0-2] of the previous frame
Definition: wmavoice.c:222
#define AVERROR(e)
Definition: error.h:43
static const struct endianess table[]
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
Definition: internal.h:181
static const float wmavoice_gain_silence[256]
int denoise_filter_cache_size
samples in denoise_filter_cache
Definition: wmavoice.c:277
int history_nsamples
number of samples in history for signal prediction (through ACB)
Definition: wmavoice.c:144
static const uint8_t wmavoice_dq_lsp10i[0xf00]
Definition: wmavoice_data.h:33
static const float wmavoice_lsp10_intercoeff_a[32][2][10]
#define t1
Definition: regdef.h:29
static const float wmavoice_energy_table[128]
LUT for 1.071575641632 * pow(1.0331663, n - 127)
Windows Media Voice (WMAVoice) tables.
Definition: avfft.h:73
const char * name
Name of the codec implementation.
Definition: avcodec.h:3746
int no_repeat_mask
Definition: acelp_vectors.h:57
int denoise_tilt_corr
Whether to apply tilt correction to the Wiener filter coefficients (postfilter)
Definition: wmavoice.c:152
int aw_idx_is_ext
whether the AW index was encoded in 8 bits (instead of 6)
Definition: wmavoice.c:227
#define t3
Definition: regdef.h:31
no adaptive codebook (only hardcoded fixed)
Definition: wmavoice.c:68
static const uint8_t offset[127][2]
Definition: vf_spp.c:92
#define FFMAX(a, b)
Definition: common.h:94
uint16_t block_conv_table[4]
boundaries for block pitch unit/scale conversion
Definition: wmavoice.c:175
#define MUL16(ra, rb)
Definition: mathops.h:88
DCTContext dst
contexts for phase shift (in Hilbert transform, part of postfilter)
Definition: wmavoice.c:266
int lsp_def_mode
defines different sets of LSP defaults [0, 1]
Definition: wmavoice.c:159
Definition: vlc.h:26
uint64_t channel_layout
Audio channel layout.
Definition: avcodec.h:2574
static int put_bits_count(PutBitContext *s)
Definition: put_bits.h:85
#define powf(x, y)
Definition: libm.h:50
int skip_bits_next
number of bits to skip at the next call to wmavoice_decode_packet() (since they're part of the previo...
Definition: wmavoice.c:196
static void dequant_lsp16r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:982
int min_pitch_val
base value for pitch parsing code
Definition: wmavoice.c:162
WMA Voice decoding context.
Definition: wmavoice.c:130
static void wiener_denoise(WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs)
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it...
Definition: wmavoice.c:720
int denoise_strength
strength of denoising in Wiener filter [0-11]
Definition: wmavoice.c:150
uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE+AV_INPUT_BUFFER_PADDING_SIZE]
cache for superframe data split over multiple packets
Definition: wmavoice.c:200
audio channel layout utility functions
Definition: avfft.h:97
#define FFMIN(a, b)
Definition: common.h:96
#define log_range(var, assign)
#define MAX_LSPS
maximum filter order
Definition: wmavoice.c:47
static VLC frame_type_vlc
Frame type VLC coding.
Definition: wmavoice.c:62
int pitch_nbits
number of bits used to specify the pitch value in the frame header
Definition: wmavoice.c:164
#define MAX_BLOCKS
maximum number of blocks per frame
Definition: wmavoice.c:46
float denoise_coeffs_pf[0x80]
aligned buffer for denoise coefficients
Definition: wmavoice.c:280
void(* dct_calc)(struct DCTContext *s, FFTSample *data)
Definition: dct.h:38
static void dequant_lsp10r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:913
float y[10]
Definition: acelp_vectors.h:56
AVFormatContext * ctx
Definition: movenc.c:48
static av_always_inline unsigned UMULH(unsigned a, unsigned b)
Definition: mathops.h:68
#define a2
Definition: regdef.h:48
Definition: dct.h:32
float sin[511]
Definition: wmavoice.c:268
static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE(*table)[2], int bits, int max_depth)
Parse a vlc code.
Definition: get_bits.h:556
#define AV_RL32
Definition: intreadwrite.h:146
Definition: avfft.h:72
int n
Definition: avisynth_c.h:684
void(* rdft_calc)(struct RDFTContext *s, FFTSample *z)
Definition: rdft.h:38
if(ret< 0)
Definition: vf_mcdeint.c:279
static int kalman_smoothen(WMAVoiceContext *s, int pitch, const float *in, float *out, int size)
Kalman smoothing function.
Definition: wmavoice.c:542
void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
Apply tilt compensation filter, 1 - tilt * z-1.
static const float wmavoice_gain_universal[64]
void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
Reconstruct LPC coefficients from the line spectral pair frequencies.
Definition: lsp.c:209
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
Set up decoder with parameters from demuxer (extradata etc.).
Definition: wmavoice.c:366
#define AVERROR_PATCHWELCOME
Not yet implemented in FFmpeg, patches welcome.
Definition: error.h:62
static const uint8_t last_coeff[3]
Definition: qdm2data.h:257
static const struct frame_type_desc frame_descs[17]
float denoise_filter_cache[MAX_FRAMESIZE]
Definition: wmavoice.c:276
Libavcodec external API header.
int sample_rate
samples per second
Definition: avcodec.h:2523
void AAC_RENAME() ff_sine_window_init(INTFLOAT *window, int n)
Generate a sine window.
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt)
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer...
Definition: wmavoice.c:1896
main external API structure.
Definition: avcodec.h:1761
static int parse_packet_header(WMAVoiceContext *s)
Parse the packet header at the start of each packet (input data to this decoder). ...
Definition: wmavoice.c:1832
int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
Get a buffer for a frame.
Definition: decode.c:1669
AVCodec ff_wmavoice_decoder
Definition: wmavoice.c:1991
int8_t vbm_tree[25]
converts VLC codes to frame type
Definition: wmavoice.c:139
int extradata_size
Definition: avcodec.h:1877
static unsigned int get_bits1(GetBitContext *s)
Definition: get_bits.h:314
static void synth_block(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth)
Parse data in a single block.
Definition: wmavoice.c:1436
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
Definition: wmavoice.c:1977
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! 
*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
static void skip_bits(GetBitContext *s, int n)
Definition: get_bits.h:307
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
Set up DCT.
Definition: dct.c:177
#define AV_CODEC_CAP_SUBFRAMES
Codec can output multiple frames per AVPacket Normally demuxers return one frame at a time...
Definition: avcodec.h:1052
int pitch_diff_sh16
((cur_pitch_val - last_pitch_val) << 16) / MAX_FRAMESIZE
Definition: wmavoice.c:223
static int init_get_bits(GetBitContext *s, const uint8_t *buffer, int bit_size)
Initialize GetBitContext.
Definition: get_bits.h:426
#define MAX_SFRAMESIZE
maximum number of samples per superframe
Definition: wmavoice.c:53
int lsp_q_mode
defines quantizer defaults [0, 1]
Definition: wmavoice.c:158
int frame_cntr
current frame index [0 - 0xFFFE]; is only used for comfort noise in pRNG()
Definition: wmavoice.c:246
void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP zero synthesis filter.
Definition: celp_filters.c:199
float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len)
Return the scalar product of two vectors.
Definition: float_dsp.c:116
#define u(width,...)
static void adaptive_gain_control(float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem)
Adaptive gain control (as used in postfilter).
Definition: wmavoice.c:501
Pitch-adaptive window (AW) pulse signals, used in particular for low-bitrate streams.
Definition: wmavoice.c:88
static const float mean_lsf[10]
Definition: siprdata.h:27
#define SFRAME_CACHE_MAXSIZE
maximum cache size for frame data that
Definition: wmavoice.c:55
#define av_log2_16bit
Definition: intmath.h:84
uint8_t fcb_type
Fixed codebook type (FCB_TYPE_*)
Definition: wmavoice.c:103
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
Parse 16 independently-coded LSPs.
Definition: wmavoice.c:949
RDFTContext irdft
contexts for FFT-calculation in the postfilter (for denoise filter)
Definition: wmavoice.c:264
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:215
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr)
Synthesize output samples for a single superframe.
Definition: wmavoice.c:1700
#define M_LN10
Definition: mathematics.h:43
Per-block pitch with signal generation using a Hamming sinc window function.
Definition: wmavoice.c:74
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation)
Parse hardcoded signal for a single block.
Definition: wmavoice.c:1271
uint8_t n_blocks
amount of blocks per frame (each block (contains 160/n_blocks samples)
Definition: wmavoice.c:99
common internal api header.
static void flush_put_bits(PutBitContext *s)
Pad the end of the output stream with zeros.
Definition: put_bits.h:101
static av_cold void wmavoice_init_static_data(AVCodec *codec)
Definition: wmavoice.c:313
int pitch_lag
Definition: acelp_vectors.h:58
float excitation_history[MAX_SIGNAL_HISTORY]
cache of the signal of previous superframes, used as a history for signal generation ...
Definition: wmavoice.c:250
static void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size)
Initialize the PutBitContext s.
Definition: put_bits.h:48
int last_pitch_val
pitch value of the previous frame
Definition: wmavoice.c:221
#define AV_INPUT_BUFFER_PADDING_SIZE
Required number of additionally allocated bytes at the end of the input bitstream for decoding...
Definition: avcodec.h:777
void * priv_data
Definition: avcodec.h:1803
#define MAX_FRAMESIZE
maximum number of samples per frame
Definition: wmavoice.c:51
float silence_gain
set for use in blocks if ACB_TYPE_NONE
Definition: wmavoice.c:225
static const double wmavoice_mean_lsf10[2][10]
static const int16_t coeffs[]
int len
int channels
number of audio channels
Definition: avcodec.h:2524
VLC_TYPE(* table)[2]
code, bits
Definition: vlc.h:28
#define lrint
Definition: tablegen.h:53
av_cold void ff_dct_end(DCTContext *s)
Definition: dct.c:220
void ff_acelp_interpolatef(float *out, const float *in, const float *filter_coeffs, int precision, int frac_pos, int filter_length, int length)
Floating point version of ff_acelp_interpolate()
Definition: acelp_filters.c:78
int block_delta_pitch_hrange
1/2 range of the delta (full range is from -this to +this-1)
Definition: wmavoice.c:173
int max_pitch_val
max value + 1 for pitch parsing
Definition: wmavoice.c:163
#define av_uninit(x)
Definition: attributes.h:148
int lsps
number of LSPs per frame [10 or 16]
Definition: wmavoice.c:157
FILE * out
Definition: movenc.c:54
#define MAX_FRAMES
maximum number of frames per superframe
Definition: wmavoice.c:50
static const float wmavoice_lsp16_intercoeff_b[32][2][16]
PutBitContext pb
bitstream writer for sframe_cache
Definition: wmavoice.c:208
#define M_PI
Definition: mathematics.h:52
uint8_t acb_type
Adaptive codebook type (ACB_TYPE_*)
Definition: wmavoice.c:102
static const float wmavoice_denoise_power_table[12][64]
LUT for f(x,y) = pow((y + 6.9) / 64, 0.025 * (x + 1)).
int dc_level
Predicted amount of DC noise, based on which a DC removal filter is used.
Definition: wmavoice.c:154
#define VLC_NBITS
number of bits to read per VLC iteration
Definition: wmavoice.c:57
static const float wmavoice_lsp16_intercoeff_a[32][2][16]
Definition: avfft.h:96
float cos[511]
8-bit cosine/sine windows over [-pi,pi] range
Definition: wmavoice.c:268
#define AV_CH_LAYOUT_MONO
av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
Set up a real FFT.
Definition: rdft.c:88
int aw_pulse_range
the range over which aw_pulse_set1() can apply the pulse, relative to the value in aw_first_pulse_off...
Definition: wmavoice.c:229
float min
This structure stores compressed data.
Definition: avcodec.h:1656
int nb_samples
number of audio samples (per channel) described by this frame
Definition: frame.h:267
float zero_exc_pf[MAX_SIGNAL_HISTORY+MAX_SFRAMESIZE]
zero filter output (i.e. excitation) by postfilter
Definition: wmavoice.c:273
#define AV_CODEC_CAP_DR1
Codec uses get_buffer() for allocating buffers and supports custom allocators.
Definition: avcodec.h:1002
for(j=16;j >0;--j)
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation)
Parse FCB/ACB signal for a single block.
Definition: wmavoice.c:1302
uint8_t dbl_pulses
how many pulse vectors have pulse pairs (rather than just one single pulse); only used if fcb_type == FCB_TYPE_EXC_PULSES
Definition: wmavoice.c:104
#define t2
Definition: regdef.h:30
#define MAX_SIGNAL_HISTORY
maximum excitation signal history
Definition: wmavoice.c:52
#define MULH
Definition: mathops.h:42
GetBitContext gb
packet bitreader.
Definition: wmavoice.c:135
static uint8_t tmp[11]
Definition: aes_ctr.c:26
bitstream writer API