ffmpeg audio/video synchronization

Original article: https://blog.csdn.net/nonmarking/article/details/50522413

For a live stream, only the sender side's synchronization needs to be considered. The principle is as follows (a minimal skeleton of this loop is sketched right after the list):

1. Demux the audio and video, and express the timestamps of the video stream and the audio stream in the same time base.
2. Compare the two converted timestamps and find the smaller one; it identifies the stream whose sending is lagging behind.
3. Read, transcode, and send that stream. In addition, if that stream's transcoding finishes quickly and runs ahead of the wall clock, delay accordingly.
4. Repeat the above.
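A minimal sketch of that loop, assuming vid_next_pts and aud_next_pts track each stream's progress in ffmpeg's internal time base { 1, AV_TIME_BASE } (these variables, and the full loop body, appear later in this article):

    AVRational time_base_q = { 1, AV_TIME_BASE };
    int64_t start_time = av_gettime();
    while (encode_video || encode_audio) {
        // Pick whichever stream has the smaller pts, i.e. is lagging behind.
        if (encode_video &&
            (!encode_audio || av_compare_ts(vid_next_pts, time_base_q,
                                            aud_next_pts, time_base_q) <= 0)) {
            // transcode one video frame, update vid_next_pts,
            // sleep if ahead of the wall clock, then write the packet
        } else {
            // same for one audio frame, updating aud_next_pts
        }
    }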

The rest of this article covers two parts: first audio transcoding, then audio/video synchronization.

Basic audio transcoding flow

First, some basic setup of the audio input and output:

//Open the input audio device; set your own device name in device_name_a
    if (avformat_open_input(&ifmt_ctx_a, device_name_a, ifmt, &device_param) != 0){

        printf("Couldn't open input audio stream.(无法打开输入流)\n");
        return -1;
    }
……
//input audio initialize
    if (avformat_find_stream_info(ifmt_ctx_a, NULL) < 0)
    {
        printf("Couldn't find audio stream information.(无法获取流信息)\n");
        return -1;
    }
    audioindex = -1;
    for (i = 0; i < ifmt_ctx_a->nb_streams; i++)
    if (ifmt_ctx_a->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO)
    {
        audioindex = i;
        break;
    }
    if (audioindex == -1)
    {
        printf("Couldn't find a audio stream.(没有找到视频流)\n");
        return -1;
    }
    if (avcodec_open2(ifmt_ctx_a->streams[audioindex]->codec, avcodec_find_decoder(ifmt_ctx_a->streams[audioindex]->codec->codec_id), NULL) < 0)
    {
        printf("Could not open audio codec.(无法打开解码器)\n");
        return -1;
    }
……
 //output audio encoder initialize
    pCodec_a = avcodec_find_encoder(AV_CODEC_ID_AAC);
    if (!pCodec_a){
        printf("Can not find output audio encoder! (没有找到合适的编码器!)\n");
        return -1;
    }
    pCodecCtx_a = avcodec_alloc_context3(pCodec_a);
    pCodecCtx_a->channels = 2;
    pCodecCtx_a->channel_layout = av_get_default_channel_layout(2);
    pCodecCtx_a->sample_rate = ifmt_ctx_a->streams[audioindex]->codec->sample_rate;
    pCodecCtx_a->sample_fmt = pCodec_a->sample_fmts[0];
    pCodecCtx_a->bit_rate = 32000;
    pCodecCtx_a->time_base.num = 1;
    pCodecCtx_a->time_base.den = pCodecCtx_a->sample_rate;
    /** Allow the use of the experimental AAC encoder */
    pCodecCtx_a->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
    /* Some formats want stream headers to be separate. */
    if (ofmt_ctx->oformat->flags & AVFMT_GLOBALHEADER)
        pCodecCtx_a->flags |= CODEC_FLAG_GLOBAL_HEADER;
    if (avcodec_open2(pCodecCtx_a, pCodec_a, NULL) < 0){
        printf("Failed to open ouput audio encoder! (编码器打开失败!)\n");
        return -1;
    }

    //Add a new stream to output; should be called by the user before avformat_write_header() for muxing
    audio_st = avformat_new_stream(ofmt_ctx, pCodec_a);
    if (audio_st == NULL){
        return -1;
    }
    audio_st->time_base.num = 1;
    audio_st->time_base.den = pCodecCtx_a->sample_rate;
    audio_st->codec = pCodecCtx_a;

Next, since the input audio's sample format may need to be converted, the swresample library is used; the corresponding initialization is done first:

// Initialize the resampler to be able to convert audio sample formats
    aud_convert_ctx = swr_alloc_set_opts(NULL,
        av_get_default_channel_layout(pCodecCtx_a->channels),
        pCodecCtx_a->sample_fmt,
        pCodecCtx_a->sample_rate,
        av_get_default_channel_layout(ifmt_ctx_a->streams[audioindex]->codec->channels),
        ifmt_ctx_a->streams[audioindex]->codec->sample_fmt,
        ifmt_ctx_a->streams[audioindex]->codec->sample_rate,
        0, NULL);
    if (swr_init(aud_convert_ctx) < 0) {
        printf("Could not initialize the resampler\n");
        return -1;
    }

In addition, following the approach of transcode_aac.c, a FIFO buffer is used to store the audio samples decoded from the input. These samples will later have their sample format converted and then be encoded, which completes the audio transcoding function.

On top of that, another buffer is needed to hold the samples after the format conversion:

//Initialize the FIFO buffer to store audio samples to be encoded. 
    AVAudioFifo *fifo = NULL;
    fifo = av_audio_fifo_alloc(pCodecCtx_a->sample_fmt, pCodecCtx_a->channels, 1);

    //Initialize the buffer to store converted samples to be encoded.
    uint8_t **converted_input_samples = NULL;
    /**
    * Allocate as many pointers as there are audio channels.
    * Each pointer will later point to the audio samples of the corresponding
    * channels (although it may be NULL for interleaved formats).
    */
    if (!(converted_input_samples = (uint8_t**)calloc(pCodecCtx_a->channels,
        sizeof(*converted_input_samples)))) {
        printf("Could not allocate converted input sample pointers\n");
        return AVERROR(ENOMEM);
    }

With that, the basic initialization work is done.

The pts for audio is computed much like for video: the sample rate gives the time interval between two adjacent audio samples, and a running count of the samples encoded so far (the role of the nb_samples variable) gives the timestamp of the current encoded audio frame. By analogy with the video path: the frame rate corresponds to the sample rate, and framecnt corresponds to nb_samples.
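As a concrete illustration, assuming a 44100 Hz stream (pts_internal and pts_out are illustrative names only; the actual code below folds these steps together):

    // Interval between two samples, in ffmpeg's internal time base { 1, AV_TIME_BASE }:
    int64_t calc_duration = AV_TIME_BASE / 44100;       // 1000000 / 44100 = 22 us (truncated)
    // With nb_samples samples encoded so far, the current frame's timestamp is:
    int64_t pts_internal = nb_samples * calc_duration;  // still in { 1, AV_TIME_BASE } units
    // which is rescaled to the muxer's time base before writing:
    int64_t pts_out = av_rescale_q(pts_internal, time_base_q, audio_st->time_base);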

//audio transcoding here
        const int output_frame_size = pCodecCtx_a->frame_size;

        /**
        * Make sure that there is one frame worth of samples in the FIFO
        * buffer so that the encoder can do its work.
        * Since the decoder's and the encoder's frame size may differ, we
        * need the FIFO buffer to store as many frames worth of input samples
        * as it takes to make up at least one frame worth of output samples.
        */
        while (av_audio_fifo_size(fifo) < output_frame_size) {
            /**
            * Decode one frame worth of audio samples, convert it to the
            * output sample format and put it into the FIFO buffer.
            */
            AVFrame *input_frame = av_frame_alloc();
            if (!input_frame)
            {
                ret = AVERROR(ENOMEM);
                return ret;
            }           

            /** Decode one frame worth of audio samples. */
            /** Packet used for temporary storage. */
            AVPacket input_packet;
            av_init_packet(&input_packet);
            input_packet.data = NULL;
            input_packet.size = 0;

            /** Read one audio frame from the input file into a temporary packet. */
            if ((ret = av_read_frame(ifmt_ctx_a, &input_packet)) < 0) {
                /** If we are at the end of the file, flush the decoder below. */
                if (ret == AVERROR_EOF)
                {
                    encode_audio = 0;
                }
                else
                {
                    printf("Could not read audio frame\n");
                    return ret;
                }                   
            }

            /**
            * Decode the audio frame stored in the temporary packet.
            * The input audio stream decoder is used to do this.
            * If we are at the end of the file, pass an empty packet to the decoder
            * to flush it.
            */
            if ((ret = avcodec_decode_audio4(ifmt_ctx_a->streams[audioindex]->codec, input_frame,
                &dec_got_frame_a, &input_packet)) < 0) {
                printf("Could not decode audio frame\n");
                return ret;
            }
            av_packet_unref(&input_packet);
            /** If there is decoded data, convert and store it */
            if (dec_got_frame_a) {
                /**
                * Allocate memory for the samples of all channels in one consecutive
                * block for convenience.
                */
                if ((ret = av_samples_alloc(converted_input_samples, NULL,
                    pCodecCtx_a->channels,
                    input_frame->nb_samples,
                    pCodecCtx_a->sample_fmt, 0)) < 0) {
                    printf("Could not allocate converted input samples\n");
                    av_freep(&(*converted_input_samples)[0]);
                    free(*converted_input_samples);
                    return ret;
                }

                /**
                * Convert the input samples to the desired output sample format.
                * This requires a temporary storage provided by converted_input_samples.
                */
                /** Convert the samples using the resampler. */
                if ((ret = swr_convert(aud_convert_ctx,
                    converted_input_samples, input_frame->nb_samples,
                    (const uint8_t**)input_frame->extended_data, input_frame->nb_samples)) < 0) {
                    printf("Could not convert input samples\n");
                    return ret;
                }

                /** Add the converted input samples to the FIFO buffer for later processing. */
                /**
                * Make the FIFO as large as it needs to be to hold both,
                * the old and the new samples.
                */
                if ((ret = av_audio_fifo_realloc(fifo, av_audio_fifo_size(fifo) + input_frame->nb_samples)) < 0) {
                    printf("Could not reallocate FIFO\n");
                    return ret;
                }

                /** Store the new samples in the FIFO buffer. */
                if (av_audio_fifo_write(fifo, (void **)converted_input_samples,
                    input_frame->nb_samples) < input_frame->nb_samples) {
                    printf("Could not write data to FIFO\n");
                    return AVERROR_EXIT;
                }
                /** Free the converted sample buffer; it is reallocated for every decoded frame. */
                av_freep(&converted_input_samples[0]);
            }
            av_frame_free(&input_frame);
        }

        /**
        * If we have enough samples for the encoder, we encode them.
        * At the end of the file, we pass the remaining samples to
        * the encoder.
        */
        if (av_audio_fifo_size(fifo) >= output_frame_size)
            /**
            * Take one frame worth of audio samples from the FIFO buffer,
            * encode it and write it to the output file.
            */
        {
            /** Temporary storage of the output samples of the frame written to the file. */
            AVFrame *output_frame = av_frame_alloc();
            if (!output_frame)
            {
                ret = AVERROR(ENOMEM);
                return ret;
            }
            /**
            * Use the maximum number of possible samples per frame.
            * If there is less than the maximum possible frame size in the FIFO
            * buffer use this number. Otherwise, use the maximum possible frame size
            */
            const int frame_size = FFMIN(av_audio_fifo_size(fifo),
                pCodecCtx_a->frame_size);

            /** Initialize temporary storage for one output frame. */
            /**
            * Set the frame's parameters, especially its size and format.
            * av_frame_get_buffer needs this to allocate memory for the
            * audio samples of the frame.
            * Default channel layouts based on the number of channels
            * are assumed for simplicity.
            */
            output_frame->nb_samples = frame_size;
            output_frame->channel_layout = pCodecCtx_a->channel_layout;
            output_frame->format = pCodecCtx_a->sample_fmt;
            output_frame->sample_rate = pCodecCtx_a->sample_rate;

            /**
            * Allocate the samples of the created frame. This call will make
            * sure that the audio frame can hold as many samples as specified.
            */
            if ((ret = av_frame_get_buffer(output_frame, 0)) < 0) {
                printf("Could not allocate output frame samples\n");
                av_frame_free(&output_frame);
                return ret;
            }

            /**
            * Read as many samples from the FIFO buffer as required to fill the frame.
            * The samples are stored in the frame temporarily.
            */
            if (av_audio_fifo_read(fifo, (void **)output_frame->data, frame_size) < frame_size) {
                printf("Could not read data from FIFO\n");
                av_frame_free(&output_frame);
                return AVERROR_EXIT;
            }

            /** Encode one frame worth of audio samples. */
            /** Packet used for temporary storage. */
            AVPacket output_packet;
            av_init_packet(&output_packet);
            output_packet.data = NULL;
            output_packet.size = 0;

            /** Set a timestamp based on the sample rate for the container. */
            nb_samples += output_frame->nb_samples;

            /**
            * Encode the audio frame and store it in the temporary packet.
            * The output audio stream encoder is used to do this.
            */
            if ((ret = avcodec_encode_audio2(pCodecCtx_a, &output_packet,
                output_frame, &enc_got_frame_a)) < 0) {
                printf("Could not encode frame\n");
                av_packet_unref(&output_packet);
                return ret;
            }

            /** Write one audio frame from the temporary packet to the output file. */
            if (enc_got_frame_a) {

                output_packet.stream_index = 1; // audio_st is stream 1 in ofmt_ctx (video is stream 0)

                AVRational time_base = ofmt_ctx->streams[1]->time_base;
                AVRational r_framerate1 = { ifmt_ctx_a->streams[audioindex]->codec->sample_rate, 1 };// e.g. { 44100, 1 }
                int64_t calc_duration = (double)(AV_TIME_BASE)*(1 / av_q2d(r_framerate1));  // interval between two samples, in internal time base units

                output_packet.pts = av_rescale_q(nb_samples*calc_duration, time_base_q, time_base);
                output_packet.dts = output_packet.pts;
                output_packet.duration = output_frame->nb_samples;

                //printf("audio pts : %d\n", output_packet.pts);
                aud_next_pts = nb_samples*calc_duration;

                int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
                int64_t now_time = av_gettime() - start_time;

                if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time)<vid_next_pts))
                    av_usleep(pts_time - now_time);

                if ((ret = av_interleaved_write_frame(ofmt_ctx, &output_packet)) < 0) {
                    printf("Could not write frame\n");
                    av_packet_unref(&output_packet);
                    return ret;
                }

                av_packet_unref(&output_packet);
            }           
            av_frame_free(&output_frame);       
        }     

Audio/video synchronization

First, define a few variables:

    int64_t aud_next_pts = 0;  // the audio stream's current pts, i.e. its progress so far (int64_t: AV_TIME_BASE units would quickly overflow a 32-bit int)
    int64_t vid_next_pts = 0;  // the video stream's current pts
    int encode_video = 1, encode_audio = 1;  // whether video / audio still needs to be encoded

The corresponding synchronization method is then:

1. Make sure that at least one of video and audio still needs transcoding.
2. Compare the progress of the two streams using av_compare_ts. Note that here both vid_next_pts and aud_next_pts use ffmpeg's internal time base, i.e. AVRational time_base_q = { 1, AV_TIME_BASE };.
3. Transcode whichever stream is lagging behind and update its progress accordingly: for video, vid_next_pts = framecnt*calc_duration; for audio, aud_next_pts = nb_samples*calc_duration. Here framecnt and nb_samples act as counters, while calc_duration is the interval between two frames or samples of the corresponding stream, also expressed in ffmpeg's internal time base.
4. If transcoding finishes very quickly, do not rush to write to the output stream; delay first, but make sure the delayed time does not run past the other stream's progress (a worked example follows this list).
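As a worked example of step 4, with hypothetical numbers for the audio branch: suppose the packet just encoded has pts_time = 900,000 while now_time = av_gettime() - start_time = 850,000, and the progress counters read aud_next_pts = 1,000,000 and vid_next_pts = 1,100,000. Then pts_time > now_time (audio is 50 ms ahead of the wall clock), and aud_next_pts + pts_time - now_time = 1,050,000 is still below vid_next_pts, so the code sleeps for pts_time - now_time = 50,000 microseconds before writing the packet.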

Putting it all together, the flow is:

 //start decode and encode
    int64_t start_time = av_gettime();
    while (encode_video || encode_audio)
    {
        if (encode_video &&
            (!encode_audio || av_compare_ts(vid_next_pts, time_base_q,
            aud_next_pts, time_base_q) <= 0))
        {
            // transcode one video frame here;
            // once transcoding is done:
            vid_next_pts = framecnt*calc_duration; //general timebase

            //Delay
            int64_t pts_time = av_rescale_q(enc_pkt.pts, time_base, time_base_q);
            int64_t now_time = av_gettime() - start_time;
            if ((pts_time > now_time) && ((vid_next_pts + pts_time - now_time) < aud_next_pts))
                av_usleep(pts_time - now_time);
            // write the packet to the output stream;
        }
        else
        {
            // transcode one audio frame here;
            // once transcoding is done:
            aud_next_pts = nb_samples*calc_duration;

            //Delay
            int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
            int64_t now_time = av_gettime() - start_time;
            if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time) < vid_next_pts))
                av_usleep(pts_time - now_time);
            // write the packet to the output stream;
        }
    }

With that, audio/video synchronization is done. All that remains is the flush_encoder work at the end.
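A minimal sketch of that flush step for the audio encoder, assuming the same deprecated avcodec_encode_audio2() API used throughout this article (timestamp handling elided; the video encoder is flushed analogously with avcodec_encode_video2()):

    // Flush the audio encoder: feed it NULL frames until no packets remain.
    int flush_got_frame = 1;
    while (flush_got_frame) {
        AVPacket flush_pkt;
        av_init_packet(&flush_pkt);
        flush_pkt.data = NULL;
        flush_pkt.size = 0;
        if (avcodec_encode_audio2(pCodecCtx_a, &flush_pkt, NULL, &flush_got_frame) < 0)
            break;
        if (flush_got_frame) {
            flush_pkt.stream_index = 1;
            // pts/dts would be set here exactly as in the normal encode path
            av_interleaved_write_frame(ofmt_ctx, &flush_pkt);
            av_packet_unref(&flush_pkt);
        }
    }
    av_write_trailer(ofmt_ctx);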
