Original article: https://blog.csdn.net/nonmarking/article/details/50522413
For a live stream, only sender-side synchronization needs to be considered. The principle is:
1. Demux the audio and video, and express the timestamps of both streams in the same time base.
2. Compare the two converted timestamps and find the smaller one, which corresponds to the stream whose sending lags behind (see the sketch after this list).
3. Read, transcode, and send that stream; meanwhile, if its transcoding is fast and runs ahead of the wall clock, an appropriate delay is also needed.
4. Repeat the above.
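As a minimal sketch of steps 1 and 2 (the helper name and its arguments are hypothetical, not from the original code), libavutil's av_compare_ts can compare two timestamps expressed in different time bases directly:
#include <stdint.h>
#include <libavutil/mathematics.h>
/* Hypothetical helper: returns nonzero if the video stream lags the audio
stream. av_compare_ts rescales both timestamps to a common base internally,
so no manual conversion is needed for the comparison itself. */
static int video_is_behind(int64_t vid_pts, AVRational vid_tb,
int64_t aud_pts, AVRational aud_tb)
{
return av_compare_ts(vid_pts, vid_tb, aud_pts, aud_tb) <= 0;
}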
The rest of this post has two parts: audio transcoding, and audio/video synchronization.
Basic audio transcoding flow
First, some basic setup for the audio input and output:
//Set own audio device's name
if (avformat_open_input(&ifmt_ctx_a, device_name_a, ifmt, &device_param) != 0){
printf("Couldn't open input audio stream.(无法打开输入流)\n");
return -1;
}
……
//input audio initialize
if (avformat_find_stream_info(ifmt_ctx_a, NULL) < 0)
{
printf("Couldn't find audio stream information.(无法获取流信息)\n");
return -1;
}
audioindex = -1;
for (i = 0; i < ifmt_ctx_a->nb_streams; i++)
if (ifmt_ctx_a->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO)
{
audioindex = i;
break;
}
if (audioindex == -1)
{
printf("Couldn't find a audio stream.(没有找到视频流)\n");
return -1;
}
if (avcodec_open2(ifmt_ctx_a->streams[audioindex]->codec, avcodec_find_decoder(ifmt_ctx_a->streams[audioindex]->codec->codec_id), NULL) < 0)
{
printf("Could not open audio codec.(无法打开解码器)\n");
return -1;
}
……
//output audio encoder initialize
pCodec_a = avcodec_find_encoder(AV_CODEC_ID_AAC);
if (!pCodec_a){
printf("Can not find output audio encoder! (没有找到合适的编码器!)\n");
return -1;
}
pCodecCtx_a = avcodec_alloc_context3(pCodec_a);
pCodecCtx_a->channels = 2;
pCodecCtx_a->channel_layout = av_get_default_channel_layout(2);
pCodecCtx_a->sample_rate = ifmt_ctx_a->streams[audioindex]->codec->sample_rate;
pCodecCtx_a->sample_fmt = pCodec_a->sample_fmts[0];
pCodecCtx_a->bit_rate = 32000;
pCodecCtx_a->time_base.num = 1;
pCodecCtx_a->time_base.den = pCodecCtx_a->sample_rate;
/** Allow the use of the experimental AAC encoder */
pCodecCtx_a->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
/* Some formats want stream headers to be separate. */
if (ofmt_ctx->oformat->flags & AVFMT_GLOBALHEADER)
pCodecCtx_a->flags |= CODEC_FLAG_GLOBAL_HEADER;
if (avcodec_open2(pCodecCtx_a, pCodec_a, NULL) < 0){
printf("Failed to open ouput audio encoder! (编码器打开失败!)\n");
return -1;
}
//Add a new stream to the output; must be called before avformat_write_header() for muxing
audio_st = avformat_new_stream(ofmt_ctx, pCodec_a);
if (audio_st == NULL){
return -1;
}
audio_st->time_base.num = 1;
audio_st->time_base.den = pCodecCtx_a->sample_rate;
audio_st->codec = pCodecCtx_a;
Next, since the input audio's sample format may need to be converted to what the encoder expects, the swresample library is used. First, initialize the resampling context accordingly:
// Initialize the resampler to be able to convert audio sample formats
aud_convert_ctx = swr_alloc_set_opts(NULL,
av_get_default_channel_layout(pCodecCtx_a->channels),
pCodecCtx_a->sample_fmt,
pCodecCtx_a->sample_rate,
av_get_default_channel_layout(ifmt_ctx_a->streams[audioindex]->codec->channels),
ifmt_ctx_a->streams[audioindex]->codec->sample_fmt,
ifmt_ctx_a->streams[audioindex]->codec->sample_rate,
0, NULL);
swr_init(aud_convert_ctx);
In addition, following the approach of FFmpeg's transcode_aac.c example, a FIFO buffer stores the audio samples decoded from the input; these samples will later be converted to the target sample format and encoded, which completes the audio transcoding path. A second buffer is also needed to hold the samples after conversion:
//Initialize the FIFO buffer to store audio samples to be encoded.
AVAudioFifo *fifo = NULL;
fifo = av_audio_fifo_alloc(pCodecCtx_a->sample_fmt, pCodecCtx_a->channels, 1);
//Initialize the buffer to store converted samples to be encoded.
uint8_t **converted_input_samples = NULL;
/**
* Allocate as many pointers as there are audio channels.
* Each pointer will later point to the audio samples of the corresponding
* channels (although it may be NULL for interleaved formats).
*/
if (!(converted_input_samples = (uint8_t**)calloc(pCodecCtx_a->channels,
sizeof(*converted_input_samples)))) {
printf("Could not allocate converted input sample pointers\n");
return AVERROR(ENOMEM);
}
This completes the basic initialization.
The method for computing the audio pts is similar to the video case: first derive the interval between two consecutive audio samples from the sample rate, then count the total number of samples encoded so far (this is what the nb_samples variable is for) to obtain the timestamp of the current encoded audio frame. By analogy with the video path: the frame rate corresponds to the sample rate, and framecnt corresponds to nb_samples.
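For example, at 44100 Hz one sample lasts 1/44100 s ≈ 22.7 µs in the internal { 1, AV_TIME_BASE } time base, so a 1024-sample AAC frame advances the pts by roughly 23220 µs. A minimal sketch of this arithmetic (the helper name is mine; note that rescaling the running sample count in one step, rather than multiplying by a truncated per-sample duration as the loop below does, avoids cumulative rounding drift):
#include <stdint.h>
#include <libavutil/avutil.h>       /* AV_TIME_BASE */
#include <libavutil/mathematics.h>  /* av_rescale */
/* pts (in 1/AV_TIME_BASE units) after nb_samples encoded samples. */
static int64_t audio_pts(int64_t nb_samples, int sample_rate)
{
return av_rescale(nb_samples, AV_TIME_BASE, sample_rate);
}
/* audio_pts(1024, 44100) == 23220, i.e. about 23.2 ms of audio */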
//audio transcoding here
const int output_frame_size = pCodecCtx_a->frame_size;
/**
* Make sure that there is one frame worth of samples in the FIFO
* buffer so that the encoder can do its work.
* Since the decoder's and the encoder's frame size may differ, we
* need the FIFO buffer to store as many frames worth of input samples
* as it takes to make up at least one frame worth of output samples.
*/
while (av_audio_fifo_size(fifo) < output_frame_size) {
/**
* Decode one frame worth of audio samples, convert it to the
* output sample format and put it into the FIFO buffer.
*/
AVFrame *input_frame = av_frame_alloc();
if (!input_frame)
{
ret = AVERROR(ENOMEM);
return ret;
}
/** Decode one frame worth of audio samples. */
/** Packet used for temporary storage. */
AVPacket input_packet;
av_init_packet(&input_packet);
input_packet.data = NULL;
input_packet.size = 0;
/** Read one audio frame from the input file into a temporary packet. */
if ((ret = av_read_frame(ifmt_ctx_a, &input_packet)) < 0) {
/** If we are at the end of the file, flush the decoder below. */
if (ret == AVERROR_EOF)
{
encode_audio = 0;
}
else
{
printf("Could not read audio frame\n");
return ret;
}
}
/**
* Decode the audio frame stored in the temporary packet.
* The input audio stream decoder is used to do this.
* If we are at the end of the file, pass an empty packet to the decoder
* to flush it.
*/
if ((ret = avcodec_decode_audio4(ifmt_ctx_a->streams[audioindex]->codec, input_frame,
&dec_got_frame_a, &input_packet)) < 0) {
printf("Could not decode audio frame\n");
return ret;
}
av_packet_unref(&input_packet);
/** If there is decoded data, convert and store it */
if (dec_got_frame_a) {
/**
* Allocate memory for the samples of all channels in one consecutive
* block for convenience.
*/
if ((ret = av_samples_alloc(converted_input_samples, NULL,
pCodecCtx_a->channels,
input_frame->nb_samples,
pCodecCtx_a->sample_fmt, 0)) < 0) {
printf("Could not allocate converted input samples\n");
av_freep(&(*converted_input_samples)[0]);
free(*converted_input_samples);
return ret;
}
/**
* Convert the input samples to the desired output sample format.
* This requires a temporary storage provided by converted_input_samples.
*/
/** Convert the samples using the resampler. */
if ((ret = swr_convert(aud_convert_ctx,
converted_input_samples, input_frame->nb_samples,
(const uint8_t**)input_frame->extended_data, input_frame->nb_samples)) < 0) {
printf("Could not convert input samples\n");
return ret;
}
/** Add the converted input samples to the FIFO buffer for later processing. */
/**
* Make the FIFO as large as it needs to be to hold both,
* the old and the new samples.
*/
if ((ret = av_audio_fifo_realloc(fifo, av_audio_fifo_size(fifo) + input_frame->nb_samples)) < 0) {
printf("Could not reallocate FIFO\n");
return ret;
}
/** Store the new samples in the FIFO buffer. */
if (av_audio_fifo_write(fifo, (void **)converted_input_samples,
input_frame->nb_samples) < input_frame->nb_samples) {
printf("Could not write data to FIFO\n");
return AVERROR_EXIT;
}
}
}
/**
* If we have enough samples for the encoder, we encode them.
* At the end of the file, we pass the remaining samples to
* the encoder.
*/
if (av_audio_fifo_size(fifo) >= output_frame_size)
/**
* Take one frame worth of audio samples from the FIFO buffer,
* encode it and write it to the output file.
*/
{
/** Temporary storage of the output samples of the frame written to the file. */
AVFrame *output_frame=av_frame_alloc();
if (!output_frame)
{
ret = AVERROR(ENOMEM);
return ret;
}
/**
* Use the maximum number of possible samples per frame.
* If there is less than the maximum possible frame size in the FIFO
* buffer use this number. Otherwise, use the maximum possible frame size
*/
const int frame_size = FFMIN(av_audio_fifo_size(fifo),
pCodecCtx_a->frame_size);
/** Initialize temporary storage for one output frame. */
/**
* Set the frame's parameters, especially its size and format.
* av_frame_get_buffer needs this to allocate memory for the
* audio samples of the frame.
* Default channel layouts based on the number of channels
* are assumed for simplicity.
*/
output_frame->nb_samples = frame_size;
output_frame->channel_layout = pCodecCtx_a->channel_layout;
output_frame->format = pCodecCtx_a->sample_fmt;
output_frame->sample_rate = pCodecCtx_a->sample_rate;
/**
* Allocate the samples of the created frame. This call will make
* sure that the audio frame can hold as many samples as specified.
*/
if ((ret = av_frame_get_buffer(output_frame, 0)) < 0) {
printf("Could not allocate output frame samples\n");
av_frame_free(&output_frame);
return ret;
}
/**
* Read as many samples from the FIFO buffer as required to fill the frame.
* The samples are stored in the frame temporarily.
*/
if (av_audio_fifo_read(fifo, (void **)output_frame->data, frame_size) < frame_size) {
printf("Could not read data from FIFO\n");
return AVERROR_EXIT;
}
/** Encode one frame worth of audio samples. */
/** Packet used for temporary storage. */
AVPacket output_packet;
av_init_packet(&output_packet);
output_packet.data = NULL;
output_packet.size = 0;
/** Set a timestamp based on the sample rate for the container. */
if (output_frame) {
nb_samples += output_frame->nb_samples;
}
/**
* Encode the audio frame and store it in the temporary packet.
* The output audio stream encoder is used to do this.
*/
if ((ret = avcodec_encode_audio2(pCodecCtx_a, &output_packet,
output_frame, &enc_got_frame_a)) < 0) {
printf("Could not encode frame\n");
av_packet_unref(&output_packet);
return ret;
}
/** Write one audio frame from the temporary packet to the output file. */
if (enc_got_frame_a) {
output_packet.stream_index = 1;
AVRational time_base = ofmt_ctx->streams[1]->time_base;
AVRational r_framerate1 = { ifmt_ctx_a->streams[audioindex]->codec->sample_rate, 1 };// { 44100, 1};
int64_t calc_duration = (double)(AV_TIME_BASE)*(1 / av_q2d(r_framerate1)); //duration of one sample in the internal time base
output_packet.pts = av_rescale_q(nb_samples*calc_duration, time_base_q, time_base);
output_packet.dts = output_packet.pts;
output_packet.duration = output_frame->nb_samples;
//printf("audio pts : %d\n", output_packet.pts);
aud_next_pts = nb_samples*calc_duration;
int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
int64_t now_time = av_gettime() - start_time;
if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time)<vid_next_pts))
av_usleep(pts_time - now_time);
if ((ret = av_interleaved_write_frame(ofmt_ctx, &output_packet)) < 0) {
printf("Could not write frame\n");
av_packet_unref(&output_packet);
return ret;
}
av_packet_unref(&output_packet);
}
av_frame_free(&output_frame);
}
Audio/video synchronization
First, define a few variables:
int64_t aud_next_pts = 0; //current pts (progress) of the audio stream
int64_t vid_next_pts = 0; //current pts (progress) of the video stream
int encode_video = 1, encode_audio = 1; //whether the video/audio stream still needs encoding
The corresponding synchronization method is then:
1. Make sure that at least one of the two streams (video, audio) still needs transcoding.
2. Compare the progress of the two streams using av_compare_ts. Note that at this point vid_next_pts and aud_next_pts are both expressed in FFmpeg's internal time base, i.e. AVRational time_base_q = { 1, AV_TIME_BASE };
3. Transcode the stream whose progress lags behind, and update its progress accordingly. For video, vid_next_pts = framecnt*calc_duration; for audio, aud_next_pts = nb_samples*calc_duration. Here framecnt and nb_samples act as counters, while calc_duration is the interval between two frames or samples of the corresponding stream, also in the internal time base.
4. If transcoding finishes quickly, do not rush to write to the output stream; delay first, while making sure the delayed time does not overtake the other stream's progress.
Putting it together, the flow is:
//start decode and encode
int64_t start_time = av_gettime();
while (encode_video || encode_audio)
{
if (encode_video &&
(!encode_audio || av_compare_ts(vid_next_pts, time_base_q,
aud_next_pts, time_base_q) <= 0))
{
/* ... transcode one video frame ... */
/* after transcoding: */
vid_next_pts=framecnt*calc_duration; //general timebase
//Delay
int64_t pts_time = av_rescale_q(enc_pkt.pts, time_base, time_base_q);
int64_t now_time = av_gettime() - start_time;
if ((pts_time > now_time) && ((vid_next_pts + pts_time - now_time)<aud_next_pts))
av_usleep(pts_time - now_time);
/* ... write the packet to the output stream ... */
}
else
{
/* ... transcode audio as shown above ... */
/* after transcoding: */
aud_next_pts = nb_samples*calc_duration;
int64_t pts_time = av_rescale_q(output_packet.pts, time_base, time_base_q);
int64_t now_time = av_gettime() - start_time;
if ((pts_time > now_time) && ((aud_next_pts + pts_time - now_time)<vid_next_pts))
av_usleep(pts_time - now_time);
/* ... write the packet to the output stream ... */
}
}
This completes the audio/video synchronization. All that remains is the usual flush_encoder work at the end.
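As a minimal sketch of that flush step for the audio side (the function name is mine; it follows the standard draining pattern for the old avcodec_encode_audio2() API used throughout this post, and the flushed packets would still need their timestamps set as in the main loop):
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
/* Drain the delayed frames buffered inside the encoder by feeding it
NULL frames until it reports no more output. */
static int flush_audio_encoder(AVFormatContext *ofmt_ctx,
AVCodecContext *enc_ctx, int stream_index)
{
int ret, got_frame;
if (!(enc_ctx->codec->capabilities & CODEC_CAP_DELAY))
return 0; /* encoder buffers nothing, no flush needed */
for (;;) {
AVPacket enc_pkt;
av_init_packet(&enc_pkt);
enc_pkt.data = NULL;
enc_pkt.size = 0;
ret = avcodec_encode_audio2(enc_ctx, &enc_pkt, NULL, &got_frame);
if (ret < 0)
return ret;
if (!got_frame)
break; /* fully drained */
enc_pkt.stream_index = stream_index;
ret = av_interleaved_write_frame(ofmt_ctx, &enc_pkt);
if (ret < 0)
return ret;
}
return 0;
}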