From 5c2b801a45fd2f5f291020b5d05b78001ae2e1dc Mon Sep 17 00:00:00 2001 From: jojo61 Date: Thu, 22 Aug 2019 12:34:29 +0200 Subject: [PATCH] First support for VAAPI --- Makefile | 118 ++++--- README.md | 8 +- codec.c | 179 ++++++++-- codec.h | 8 +- drirc | 28 ++ openglosd.cpp | 104 +++++- softhdcuvid.cpp | 42 ++- video.c | 861 +++++++++++++++++++++++++++++++++++++----------- 8 files changed, 1058 insertions(+), 290 deletions(-) create mode 100644 drirc diff --git a/Makefile b/Makefile index 702f6ef..bf0b343 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # # Makefile for a Video Disk Recorder plugin -# +# # $Id: 2a41981a57e5e83036463c6a08c84b86ed9d2be3 $ # The official name of this plugin. @@ -11,35 +11,46 @@ PLUGIN = softhdcuvid ### Configuration (edit this for your needs) +# what kind of driver do we make - +# if VAAPI is enabled the drivername is softhdvaapi +# if CUVID is enabled the drivername is softhdcuvid +#VAAPI=1 +CUVID=1 + + +# support OPENGLOSD - only configurable with cuvid +OPENGLOSD=1 + +# use Libplacebo - only configurable with cuvid +LIBPLACEBO=1 + +# use YADIF deint - only configurable with cuvid +YADIF=0 + +#--------------------- no more config needed past this point-------------------------------- # support alsa audio output module ALSA ?= $(shell pkg-config --exists alsa && echo 1) # support OSS audio output module OSS ?= 1 - # support OPENGLOSD -OPENGLOSD=1 - # use Libplacebo -LIBPLACEBO=0 - - # use DMPS +# use DMPS SCREENSAVER=1 OPENGL=1 - # use ffmpeg libswresample -#SWRESAMPLE ?= $(shell pkg-config --exists libswresample && echo 1) -SWRESAMPLE = 1 - # use libav libavresample -ifneq ($(SWRESAMPLE),1) -AVRESAMPLE ?= $(shell pkg-config --exists libavresample && echo 1) -AVRESAMPLE = 0 -endif -CONFIG := #-DDEBUG #-DOSD_DEBUG # enable debug output+functions -CONFIG += -DCUVID # enable CUVID decoder -#CONFIG += -DYADIF # enable yadif_cuda deinterlacer +# use ffmpeg libswresample +SWRESAMPLE ?= $(shell pkg-config --exists libswresample && echo 
1) +SWRESAMPLE = 1 + +# use libav libavresample +#ifneq ($(SWRESAMPLE),1) +#AVRESAMPLE ?= $(shell pkg-config --exists libavresample && echo 1#) +#AVRESAMPLE = 1 +#endif + +CONFIG := -DDEBUG #-DOSD_DEBUG # enable debug output+functions CONFIG += -DHAVE_GL # needed for mpv libs #CONFIG += -DSTILL_DEBUG=2 # still picture debug verbose level - CONFIG += -DAV_INFO -DAV_INFO_TIME=3000 # info/debug a/v sync CONFIG += -DUSE_PIP # PIP support #CONFIG += -DHAVE_PTHREAD_NAME # supports new pthread_setname_np @@ -54,8 +65,9 @@ CONFIG += -DUSE_VDR_SPU # use VDR SPU decoder. ### The version number of this plugin (taken from the main source file): -VERSION = $(shell grep 'static const char \*const VERSION *=' $(PLUGIN).cpp | awk '{ print $$7 }' | sed -e 's/[";]//g') +VERSION = $(shell grep 'static const char \*const VERSION *=' softhdcuvid.cpp | awk '{ print $$7 }' | sed -e 's/[";]//g') GIT_REV = $(shell git describe --always 2>/dev/null) +### The name of the distribution archive: ### The directory environment: @@ -87,14 +99,7 @@ APIVERSION = $(call PKGCFG,apiversion) -include $(PLGCFG) -### The name of the distribution archive: -ARCHIVE = $(PLUGIN)-$(VERSION) -PACKAGE = vdr-$(ARCHIVE) - -### The name of the shared object file: - -SOFILE = libvdr-$(PLUGIN).so ### Parse softhddevice config @@ -103,16 +108,20 @@ CONFIG += -DUSE_ALSA _CFLAGS += $(shell pkg-config --cflags alsa) LIBS += $(shell pkg-config --libs alsa) endif + ifeq ($(OSS),1) CONFIG += -DUSE_OSS endif + ifeq ($(OPENGL),1) _CFLAGS += $(shell pkg-config --cflags libva-glx) LIBS += $(shell pkg-config --libs libva-glx) endif + ifeq ($(OPENGLOSD),1) CONFIG += -DUSE_OPENGLOSD endif + ifeq ($(OPENGL),1) CONFIG += -DUSE_GLX _CFLAGS += $(shell pkg-config --cflags gl glu glew) @@ -122,18 +131,43 @@ LIBS += $(shell pkg-config --libs glew) _CFLAGS += $(shell pkg-config --cflags freetype2) LIBS += $(shell pkg-config --libs freetype2) endif + +ifeq ($(VAAPI),1) +CONFIG += -DVAAPI +LIBPLACEBO=1 +PLUGIN = softhdvaapi +endif + ifeq 
($(LIBPLACEBO),1) CONFIG += -DPLACEBO endif +ifeq ($(CUVID),1) +CONFIG += -DCUVID # enable CUVID decoder +ifeq ($(YADIF),1) +CONFIG += -DYADIF # Yadif only with CUVID +endif +endif + + + + +ARCHIVE = $(PLUGIN)-$(VERSION) +PACKAGE = vdr-$(ARCHIVE) + +### The name of the shared object file: + +SOFILE = libvdr-$(PLUGIN).so + + # # Test that libswresample is available # -ifneq (exists, $(shell pkg-config libswresample && echo exists)) - $(warning ******************************************************************) - $(warning 'libswresample' not found!) - $(error ******************************************************************) -endif +#ifneq (exists, $(shell pkg-config libswresample && echo exists)) +# $(warning ******************************************************************) +# $(warning 'libswresample' not found!) +# $(error ******************************************************************) +#endif # # Test and set config for libavutil @@ -179,11 +213,11 @@ CONFIG += -DUSE_SWRESAMPLE _CFLAGS += $(shell pkg-config --cflags libswresample) LIBS += $(shell pkg-config --libs libswresample) endif -#ifeq ($(AVRESAMPLE),1) -#CONFIG += -DUSE_AVRESAMPLE -#_CFLAGS += $(shell pkg-config --cflags libavresample) -#LIBS += $(shell pkg-config --libs libavresample) -#endif +ifeq ($(AVRESAMPLE),1) +CONFIG += -DUSE_AVRESAMPLE +_CFLAGS += $(shell pkg-config --cflags libavresample) +LIBS += $(shell pkg-config --libs libavresample) +endif #_CFLAGS += $(shell pkg-config --cflags libavcodec x11 x11-xcb xcb xcb-icccm) #LIBS += -lrt $(shell pkg-config --libs libavcodec x11 x11-xcb xcb xcb-icccm) @@ -200,7 +234,11 @@ ifeq ($(LIBPLACEBO),1) LIBS += -lplacebo -lglut endif -LIBS += -lGLEW -lGLX -ldl -lcuda -L/usr/local/cuda/targets/x86_64-linux/lib -lcudart -lnvcuvid +ifeq ($(CUVID),1) +LIBS += -lcuda -L/usr/local/cuda/targets/x86_64-linux/lib -lcudart -lnvcuvid +endif + +LIBS += -lGLEW -lGLX -ldl ### Includes and Defines (add further entries here): INCLUDES += @@ -218,12 +256,12 @@ 
override CFLAGS += $(_CFLAGS) $(DEFINES) $(INCLUDES) \ ### The object files (add further files here): -OBJS = $(PLUGIN).o softhddev.o video.o audio.o codec.o ringbuffer.o +OBJS = softhdcuvid.o softhddev.o video.o audio.o codec.o ringbuffer.o ifeq ($(OPENGLOSD),1) OBJS += openglosd.o endif -SRCS = $(wildcard $(OBJS:.o=.c)) $(PLUGIN).cpp +SRCS = $(wildcard $(OBJS:.o=.c)) softhdcuvid.cpp ### The main target: diff --git a/README.md b/README.md index a3e448b..42fb834 100644 --- a/README.md +++ b/README.md @@ -40,13 +40,17 @@ A software and GPU emulated UHD output device plugin for VDR. To compile you must have the 'requires' installed. -This is a fork of johns original softhddevice work and I reworked ist to support HEVC with CUDA and opengl output. +This is a fork of johns original softhddevice work and I reworked it to support HEVC with CUDA and opengl output. Currently I have tested it with a GTX 1050 from NVIDIA. SD, HD and UHD is working. -Current Status NVIDA: +Current Status NVIDIA: The CUDA driver supports HEVC with 8 Bit and 10 Bit up to UHD resolution. Opengl is able to output also 10 Bit, but NVIDIA does not support to output 10 Bit via HDMI. Only via DisplayPort you can get 10 Bit output to a compatible screen. This is a restriction from NVIDIA. +Current Status with VAAPI +You need libplacebo. +It is still beta and I tested it with Intel VAAPI. If you have problems with the shaders then copy the drirc file in your home directory as .drirc +AMD VAAPI is broken by AMD and will not work currently. The vaapi_deinterlace is broken and the amdgpu driver is unstable. I have not tested with amdgpupro You have to adapt the Makefile to your needs. I use FFMPEG 4.0 The Makefile expects the CUDA SDK in /usr/local/cuda. 
Currently it is tested with CUDA 10 diff --git a/codec.c b/codec.c index fb7a550..2a170d5 100644 --- a/codec.c +++ b/codec.c @@ -1,3 +1,4 @@ + /// /// @file codec.c @brief Codec functions /// @@ -200,16 +201,20 @@ static int Codec_get_buffer2(AVCodecContext * video_ctx, AVFrame * frame, int fl if (!decoder->GetFormatDone) { // get_format missing enum AVPixelFormat fmts[2]; - fprintf(stderr, "codec: buggy libav, use ffmpeg\n"); - Warning(_("codec: buggy libav, use ffmpeg\n")); +// fprintf(stderr, "codec: buggy libav, use ffmpeg\n"); +// Warning(_("codec: buggy libav, use ffmpeg\n")); fmts[0] = video_ctx->pix_fmt; fmts[1] = AV_PIX_FMT_NONE; Codec_get_format(video_ctx, fmts); } - if (decoder->hwaccel_get_buffer && (AV_PIX_FMT_VDPAU == decoder->hwaccel_pix_fmt || AV_PIX_FMT_CUDA == decoder->hwaccel_pix_fmt)) { +#if 0 + if (decoder->hwaccel_get_buffer && (AV_PIX_FMT_VDPAU == decoder->hwaccel_pix_fmt || + AV_PIX_FMT_CUDA == decoder->hwaccel_pix_fmt || + AV_PIX_FMT_VAAPI == decoder->hwaccel_pix_fmt)) { //Debug(3,"hwaccel get_buffer\n"); return decoder->hwaccel_get_buffer(video_ctx, frame, flags); } +#endif //Debug(3, "codec: fallback to default get_buffer\n"); return avcodec_default_get_buffer2(video_ctx, frame, flags); } @@ -268,6 +273,7 @@ void CodecVideoOpen(VideoDecoder * decoder, int codec_id) } name = "NULL"; +#ifdef CUVID if (!strcasecmp(VideoGetDriverName(), "cuvid")) { switch (codec_id) { case AV_CODEC_ID_MPEG2VIDEO: @@ -281,11 +287,11 @@ void CodecVideoOpen(VideoDecoder * decoder, int codec_id) break; } } - +#endif if (name && (video_codec = avcodec_find_decoder_by_name(name))) { Debug(3, "codec: decoder found\n"); - } else { - Debug(3,"Decoder %s not supported\n",name); + } else if ((video_codec = avcodec_find_decoder(codec_id))==NULL) { + Debug(3,"Decoder %s not supported %p\n",name,video_codec); Fatal(_(" No decoder found")); } @@ -309,13 +315,45 @@ void CodecVideoOpen(VideoDecoder * decoder, int codec_id) decoder->VideoCtx->pkt_timebase.den = 90000; 
decoder->VideoCtx->framerate.num = 50; decoder->VideoCtx->framerate.den = 1; + decoder->VideoCtx->extra_hw_frames = 8; // VIDEO_SURFACES_MAX +1 pthread_mutex_lock(&CodecLockMutex); // open codec #ifdef YADIF - deint = 0; + deint = 2; #endif +#ifdef VAAPI + if (video_codec->capabilities & (AV_CODEC_CAP_AUTO_THREADS)) { + Debug(3,"codec: auto threads enabled"); + decoder->VideoCtx->thread_count = 0; + } + if (video_codec->capabilities & AV_CODEC_CAP_TRUNCATED) { + Debug(3,"codec: supports truncated packets"); + //decoder->VideoCtx->flags |= CODEC_FLAG_TRUNCATED; + } + // FIXME: own memory management for video frames. + if (video_codec->capabilities & AV_CODEC_CAP_DR1) { + Debug(3,"codec: can use own buffer management"); + } + if (video_codec->capabilities & AV_CODEC_CAP_FRAME_THREADS) { + Debug(3,"codec: supports frame threads"); + decoder->VideoCtx->thread_count = 0; + // decoder->VideoCtx->thread_type |= FF_THREAD_FRAME; + } + if (video_codec->capabilities & AV_CODEC_CAP_SLICE_THREADS) { + Debug(3,"codec: supports slice threads"); + decoder->VideoCtx->thread_count = 0; + // decoder->VideoCtx->thread_type |= FF_THREAD_SLICE; + } + if (av_opt_set_int(decoder->VideoCtx, "refcounted_frames", 1, 0)<0) + Fatal(_("VAAPI Refcounts invalid\n")); + decoder->VideoCtx->thread_safe_callbacks = 0; +#endif + + + +#ifdef CUVID if (strcmp(decoder->VideoCodec->long_name,"Nvidia CUVID MPEG2VIDEO decoder") == 0) { // deinterlace for mpeg2 is somehow broken if (av_opt_set_int(decoder->VideoCtx->priv_data, "deint", deint ,0) < 0) { // adaptive pthread_mutex_unlock(&CodecLockMutex); @@ -348,7 +386,7 @@ void CodecVideoOpen(VideoDecoder * decoder, int codec_id) Fatal(_("codec: can't set option drop 2.field to video codec!\n")); } } - +#endif if ((ret = avcodec_open2(decoder->VideoCtx, video_codec, NULL)) < 0) { pthread_mutex_unlock(&CodecLockMutex); @@ -363,24 +401,25 @@ void CodecVideoOpen(VideoDecoder * decoder, int codec_id) //decoder->VideoCtx->debug = FF_DEBUG_STARTCODE; 
//decoder->VideoCtx->err_recognition |= AV_EF_EXPLODE; +// av_log_set_level(AV_LOG_DEBUG); + av_log_set_level(0); decoder->VideoCtx->get_format = Codec_get_format; decoder->VideoCtx->get_buffer2 = Codec_get_buffer2; - decoder->VideoCtx->thread_count = 1; - decoder->VideoCtx->active_thread_type = 0; +// decoder->VideoCtx->active_thread_type = 0; decoder->VideoCtx->draw_horiz_band = NULL; - if (strstr(decoder->VideoCodec->long_name,"Nvidia CUVID") != NULL) - decoder->VideoCtx->hwaccel_context = VideoGetHwAccelContext(decoder->HwDecoder); + decoder->VideoCtx->hwaccel_context = VideoGetHwAccelContext(decoder->HwDecoder); // // Prepare frame buffer for decoder // - +#if 0 if (!(decoder->Frame = av_frame_alloc())) { Fatal(_("codec: can't allocate video decoder frame buffer\n")); } - +#endif + // reset buggy ffmpeg/libav flag decoder->GetFormatDone = 0; #ifdef YADIF @@ -401,7 +440,7 @@ void CodecVideoClose(VideoDecoder *video_decoder) { // FIXME: play buffered data - av_frame_free(&video_decoder->Frame); // callee does checks +// av_frame_free(&video_decoder->Frame); // callee does checks Debug(3,"CodecVideoClose\n"); if (video_decoder->VideoCtx) { @@ -465,37 +504,100 @@ extern int CuvidTestSurfaces(); extern int init_filters(AVCodecContext * dec_ctx,void * decoder,AVFrame *frame); extern int push_filters(AVCodecContext * dec_ctx,void * decoder,AVFrame *frame); #endif +#ifdef VAAPI +void CodecVideoDecode(VideoDecoder * decoder, const AVPacket * avpkt) +{ + AVCodecContext *video_ctx = decoder->VideoCtx; + + if (video_ctx->codec_type == AVMEDIA_TYPE_VIDEO) { + int ret; + AVPacket pkt[1]; + AVFrame *frame; + + *pkt = *avpkt; // use copy + ret = avcodec_send_packet(video_ctx, pkt); + if (ret < 0) { + Debug(4,"codec: sending video packet failed"); + return; + } + frame = av_frame_alloc(); + ret = avcodec_receive_frame(video_ctx, frame); + if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF) { + Debug(4,"codec: receiving video frame failed"); + av_frame_free(&frame); + 
return; + } + if (ret >= 0) { + if (decoder->filter ) { + if (decoder->filter == 1) { + if (init_filters(video_ctx,decoder->HwDecoder,frame) < 0) { + Debug(3,"video: Init of VAAPI deint Filter failed\n"); + decoder->filter = 0; + } + else { + Debug(3,"Init VAAPI deint ok\n"); + decoder->filter = 2; + } + } + if (frame->interlaced_frame && decoder->filter == 2 && (frame->height != 720)) { // broken ZDF sends Interlaced flag + ret = push_filters(video_ctx,decoder->HwDecoder,frame); + return; + } + } + VideoRenderFrame(decoder->HwDecoder, video_ctx, frame); + } + else { + av_frame_free(&frame); + } + } +} +#endif +#ifdef CUVID + void CodecVideoDecode(VideoDecoder * decoder, const AVPacket * avpkt) { AVCodecContext *video_ctx; - AVFrame *frame; + AVFrame *frame + +; int ret,ret1; int got_frame; int consumed = 0; + static uint64_t first_time = 0; const AVPacket *pkt; - + next_part: video_ctx = decoder->VideoCtx; - frame = decoder->Frame; + pkt = avpkt; // use copy got_frame = 0; + +// printf("decode packet %d\n",(GetusTicks()-first_time)/1000000); ret1 = avcodec_send_packet(video_ctx, pkt); +// first_time = GetusTicks(); + if (ret1 >= 0) { consumed = 1; - } + } + + if (!CuvidTestSurfaces()) + usleep(1000); + +//printf("send packet to decode %s\n",consumed?"ok":"Full"); + if ((ret1 == AVERROR(EAGAIN) || ret1 == AVERROR_EOF || ret1 >= 0) && CuvidTestSurfaces()) { ret = 0; while ((ret >= 0) && CuvidTestSurfaces()) { // get frames until empty snd Surfaces avail. - + frame = av_frame_alloc(); ret = avcodec_receive_frame(video_ctx, frame); // get new frame if (ret >= 0) { // one is avail. 
- got_frame = 1; + got_frame = 1; } else { got_frame = 0; } - +// printf("got %s packet from decoder\n",got_frame?"1":"no"); if (got_frame) { // frame completed #ifdef YADIF if (decoder->filter ) { @@ -511,12 +613,13 @@ next_part: } if (frame->interlaced_frame && decoder->filter == 2 && (frame->height != 720)) { // broken ZDF sends Interlaced flag ret = push_filters(video_ctx,decoder->HwDecoder,frame); - av_frame_unref(frame); +// av_frame_unref(frame); continue; } } #endif #ifdef FFMPEG_WORKAROUND_ARTIFACTS + if (!CodecUsePossibleDefectFrames && decoder->FirstKeyFrame) { decoder->FirstKeyFrame++; if (frame->key_frame || (decoder->FirstKeyFrame > 3)) { // key frame is not reliable @@ -524,31 +627,35 @@ next_part: decoder->FirstKeyFrame = 0; VideoRenderFrame(decoder->HwDecoder, video_ctx, frame); } - av_frame_unref(frame); +// av_frame_unref(frame); } else { //DisplayPts(video_ctx, frame); VideoRenderFrame(decoder->HwDecoder, video_ctx, frame); - av_frame_unref(frame); +// av_frame_unref(frame); } #else //DisplayPts(video_ctx, frame); VideoRenderFrame(decoder->HwDecoder, video_ctx, frame); - av_frame_unref(frame); +// av_frame_unref(frame); #endif -// printf("got frame\n"); - } else { + } else { + av_frame_free(&frame); // printf("codec: got no frame %d send %d\n",ret,ret1); } } + if (!CuvidTestSurfaces()) { + usleep(1000); + } } else { // consumed = 1; } if (!consumed) { goto next_part; // try again to stuff decoder - } -} + } +} +#endif /** ** Flush the video decoder. 
@@ -702,7 +809,7 @@ void CodecAudioOpen(AudioDecoder * audio_decoder, int codec_id) Debug(3, "codec: using audio codec ID %#06x (%s)\n", codec_id, avcodec_get_name(codec_id)); - if (!(audio_codec = avcodec_find_decoder_by_name(avcodec_get_name(codec_id)))) { + if (!(audio_codec = avcodec_find_decoder(codec_id))) { // if (!(audio_codec = avcodec_find_decoder(codec_id))) { Fatal(_("codec: codec ID %#06x not found\n"), codec_id); // FIXME: errors aren't fatal @@ -1291,7 +1398,7 @@ void CodecAudioEnqueue(AudioDecoder * audio_decoder, int16_t * data, int count) #ifdef USE_AUDIO_DRIFT_CORRECTION if ((CodecAudioDrift & CORRECT_PCM) && audio_decoder->AvResample) { int16_t buf[(AVCODEC_MAX_AUDIO_FRAME_SIZE * 3) / 4 + - FF_INPUT_BUFFER_PADDING_SIZE] __attribute__ ((aligned(16))); + AV_INPUT_BUFFER_PADDING_SIZE] __attribute__ ((aligned(16))); int16_t buftmp[MAX_CHANNELS][(AVCODEC_MAX_AUDIO_FRAME_SIZE * 3) / 4]; int consumed; int i; @@ -1383,7 +1490,7 @@ int myavcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples, // in the caller may be able to be optimized. 
ret = avcodec_receive_frame(avctx,frame); if (ret == 0) - got_frame = true; + got_frame = 1; if (ret == AVERROR(EAGAIN)) ret = 0; if (ret == 0) @@ -1392,7 +1499,7 @@ int myavcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples, ret = 0; else if (ret < 0) { - Debug(3, "codec/audio: audio decode error: %1 (%2)\n",av_make_error_string(error, sizeof(error), ret),got_frame); +// Debug(3, "codec/audio: audio decode error: %1 (%2)\n",av_make_error_string(error, sizeof(error), ret),got_frame); return ret; } else @@ -1434,7 +1541,7 @@ int myavcodec_decode_audio3(AVCodecContext *avctx, int16_t *samples, void CodecAudioDecode(AudioDecoder * audio_decoder, const AVPacket * avpkt) { int16_t buf[(AVCODEC_MAX_AUDIO_FRAME_SIZE * 3) / 4 + - FF_INPUT_BUFFER_PADDING_SIZE] __attribute__ ((aligned(16))); + AV_INPUT_BUFFER_PADDING_SIZE] __attribute__ ((aligned(16))); int buf_sz; int l; AVCodecContext *audio_ctx; @@ -1470,7 +1577,7 @@ void CodecAudioDecode(AudioDecoder * audio_decoder, const AVPacket * avpkt) // need to resample audio if (audio_decoder->ReSample) { int16_t outbuf[(AVCODEC_MAX_AUDIO_FRAME_SIZE * 3) / 4 + - FF_INPUT_BUFFER_PADDING_SIZE] + AV_INPUT_BUFFER_PADDING_SIZE] __attribute__ ((aligned(16))); int outlen; diff --git a/codec.h b/codec.h index 3b61c5a..6396968 100644 --- a/codec.h +++ b/codec.h @@ -61,10 +61,10 @@ struct _video_decoder_ //#ifdef FFMPEG_WORKAROUND_ARTIFACTS int FirstKeyFrame; ///< flag first frame //#endif - AVFrame *Frame; ///< decoded video frame -#ifdef YADIF - int filter; // flag for yadif filter -#endif +// AVFrame *Frame; ///< decoded video frame + + int filter; // flag for deint filter + /* hwaccel options */ enum HWAccelID hwaccel_id; char *hwaccel_device; diff --git a/drirc b/drirc new file mode 100644 index 0000000..f9b1e69 --- /dev/null +++ b/drirc @@ -0,0 +1,28 @@ + + + + + + diff --git a/openglosd.cpp b/openglosd.cpp index cd52655..595f131 100644 --- a/openglosd.cpp +++ b/openglosd.cpp @@ -18,7 +18,7 @@ void ConvertColor(const GLint 
&colARGB, glm::vec4 &col) { * cShader ****************************************************************************************/ - +#ifdef CUVID const char *rectVertexShader = "#version 330 core \n\ \ @@ -116,6 +116,106 @@ void main() \ } \ "; +#else + +const char *rectVertexShader = +"\n\ + +\ +layout (location = 0) in vec2 position; \ +out vec4 rectCol; \ +uniform vec4 inColor; \ +uniform mat4 projection; \ +\ +void main() \ +{ \ + gl_Position = projection * vec4(position.x, position.y, 0.0, 1.0); \ + rectCol = inColor; \ +} \ +"; + +const char *rectFragmentShader = +"#version 330 core \n\ +\ +in vec4 rectCol; \ +out vec4 color; \ +\ +void main() \ +{ \ + color = rectCol; \ +} \ +"; + +const char *textureVertexShader = +"\n\ +\ +layout (location = 0) in vec2 position; \ +layout (location = 1) in vec2 texCoords; \ +\ +out vec2 TexCoords; \ +out vec4 alphaValue;\ +\ +uniform mat4 projection; \ +uniform vec4 alpha; \ +\ +void main() \ +{ \ + gl_Position = projection * vec4(position.x, position.y, 0.0, 1.0); \ + TexCoords = texCoords; \ + alphaValue = alpha; \ +} \ +"; + +const char *textureFragmentShader = +"\n\ +in vec2 TexCoords; \ +in vec4 alphaValue; \ +out vec4 color; \ +\ +uniform sampler2D screenTexture; \ +\ +void main() \ +{ \ + color = texture(screenTexture, TexCoords) * alphaValue; \ +} \ +"; + +const char *textVertexShader = +"\n\ +\ +layout (location = 0) in vec2 position; \ +layout (location = 1) in vec2 texCoords; \ +\ +out vec2 TexCoords; \ +out vec4 textColor; \ +\ +uniform mat4 projection; \ +uniform vec4 inColor; \ +\ +void main() \ +{ \ + gl_Position = projection * vec4(position.x, position.y, 0.0, 1.0); \ + TexCoords = texCoords; \ + textColor = inColor; \ +} \ +"; + +const char *textFragmentShader = +"\n\ +in vec2 TexCoords; \ +in vec4 textColor; \ +\ +out vec4 color; \ +\ +uniform sampler2D glyphTexture; \ +\ +void main() \ +{ \ + vec4 sampled = vec4(1.0, 1.0, 1.0, texture(glyphTexture, TexCoords).r); \ + color = textColor * sampled; \ +} \ +"; 
+#endif static cShader *Shaders[stCount]; void cShader::Use(void) { @@ -1588,7 +1688,7 @@ extern "C" int GlxInitopengl(); bool cOglThread::InitOpenGL(void) { -#ifdef PLACEBO +#ifdef PLACEBO const char *displayName = X11DisplayName; if (!displayName) { displayName = getenv("DISPLAY"); diff --git a/softhdcuvid.cpp b/softhdcuvid.cpp index cdaf47f..4eb4b1a 100644 --- a/softhdcuvid.cpp +++ b/softhdcuvid.cpp @@ -67,7 +67,7 @@ extern "C" /// vdr-plugin version number. /// Makefile extracts the version number for generating the file name /// for the distribution archive. -static const char *const VERSION = "1.1.0" +static const char *const VERSION = "2.0.0" #ifdef GIT_REV "-GIT" GIT_REV #endif @@ -75,7 +75,7 @@ static const char *const VERSION = "1.1.0" /// vdr-plugin description. static const char *const DESCRIPTION = -trNOOP("A software and GPU emulated HD device"); +trNOOP("A software and GPU emulated UHD device"); /// vdr-plugin text of main menu entry static const char *MAINMENUENTRY = trNOOP("SoftUHD"); @@ -232,7 +232,7 @@ class cSoftRemote:public cRemote ** @param release flag key released */ bool Put(const char *code, bool repeat = false, bool release = false) { - return cRemote::Put(code, repeat, release); + return cRemote::Put(code, repeat, release); } }; @@ -249,40 +249,38 @@ extern "C" void FeedKeyPress(const char *keymap, const char *key, int repeat, int release, const char *letter) { cRemote *remote; - cSoftRemote *csoft; + cSoftRemote *csoft; if (!keymap || !key) { - return; + return; } // find remote for (remote = Remotes.First(); remote; remote = Remotes.Next(remote)) { - if (!strcmp(remote->Name(), keymap)) { - break; - } + if (!strcmp(remote->Name(), keymap)) { + break; + } } // if remote not already exists, create it if (remote) { - csoft = (cSoftRemote *) remote; + csoft = (cSoftRemote *) remote; } else { - dsyslog("[softhddev]%s: remote '%s' not found\n", __FUNCTION__, - keymap); - csoft = new cSoftRemote(keymap); + dsyslog("[softhddev]%s: remote '%s' 
not found\n", __FUNCTION__, keymap); + csoft = new cSoftRemote(keymap); } //dsyslog("[softhddev]%s %s, %s, %s\n", __FUNCTION__, keymap, key, letter); if (key[1]) { // no single character - if (!csoft->Put(key, repeat, release) && letter - && !cRemote::IsLearning()) { - cCharSetConv conv; - unsigned code; + if (!csoft->Put(key, repeat, release) && letter && !cRemote::IsLearning()) { + cCharSetConv conv; + unsigned code; - code = Utf8CharGet(conv.Convert(letter)); - if (code <= 0xFF) { - cRemote::Put(KBDKEY(code)); // feed it for edit mode - } - } + code = Utf8CharGet(conv.Convert(letter)); + if (code <= 0xFF) { + cRemote::Put(KBDKEY(code)); // feed it for edit mode + } + } } else if (!csoft->Put(key, repeat, release)) { - cRemote::Put(KBDKEY(key[0])); // feed it for edit mode + cRemote::Put(KBDKEY(key[0])); // feed it for edit mode } } diff --git a/video.c b/video.c index 7a0fac0..5cfbeb4 100644 --- a/video.c +++ b/video.c @@ -1,4 +1,3 @@ - /// /// @file video.c @brief Video module /// @@ -153,11 +152,12 @@ typedef enum #include #endif +#include + #ifdef CUVID //#define CUDA_API_PER_THREAD_DEFAULT_STREAM #include // For GL_COLOR_BUFFER_BIT //#include // For GL_COLOR_BUFFER_BIT -#include #include //#include #include @@ -169,6 +169,18 @@ typedef enum #define __DEVICE_TYPES_H__ #endif +#ifdef VAAPI +#include +#include +#include +#define TO_AVHW_DEVICE_CTX(x) ((AVHWDeviceContext*)x->data) +#define TO_AVHW_FRAMES_CTX(x) ((AVHWFramesContext*)x->data) + +#define TO_VAAPI_DEVICE_CTX(x) ((AVVAAPIDeviceContext*)TO_AVHW_DEVICE_CTX(x)->hwctx) +#define TO_VAAPI_FRAMES_CTX(x) ((AVVAAPIFramesContext*)TO_AVHW_FRAMES_CTX(x)->hwctx) +#endif + + #ifdef PLACEBO #define VK_USE_PLATFORM_XCB_KHR @@ -197,7 +209,7 @@ typedef enum #include #include -#ifdef YADIF +#if defined(YADIF) || defined (VAAPI) #include #include #include @@ -340,7 +352,7 @@ typedef struct { #define CODEC_SURFACES_MAX 16 ///< maximal of surfaces -#define VIDEO_SURFACES_MAX 8 ///< video output surfaces for queue +#define 
VIDEO_SURFACES_MAX 6 ///< video output surfaces for queue //#define OUTPUT_SURFACES_MAX 4 ///< output surfaces for flip page //---------------------------------------------------------------------------- @@ -453,7 +465,7 @@ static pthread_mutex_t VideoLockMutex; ///< video lock mutex pthread_mutex_t OSDMutex; ///< OSD update mutex #endif - +int skipwait; static pthread_t VideoDisplayThread; ///< video display thread //static pthread_cond_t VideoDisplayWakeupCond; ///< wakeup condition variable @@ -502,8 +514,8 @@ GLXContext OSDcontext; // Common Functions //---------------------------------------------------------------------------- -static void VideoThreadLock(void); ///< lock video thread -static void VideoThreadUnlock(void); ///< unlock video thread +void VideoThreadLock(void); ///< lock video thread +void VideoThreadUnlock(void); ///< unlock video thread static void VideoThreadExit(void); ///< exit/kill video thread #ifdef USE_SCREENSAVER @@ -897,6 +909,7 @@ static void GlxUploadOsdTexture(int x, int y, int width, int height, /// static void GlxOsdInit(__attribute__((unused))int width, __attribute__((unused))int height) { + int i; #ifdef DEBUG if (!GlxEnabled) { Debug(3, "video/glx: %s called without glx enabled\n", __FUNCTION__); @@ -1596,16 +1609,27 @@ static void AutoCropDetect(AutoCropCtx * autocrop, int width, int height, #ifdef PLACEBO struct ext_buf { int fd; +#ifdef CUVID CUexternalMemory mem; CUdeviceptr buf; +#endif }; #endif +#ifdef VAAPI +static VADisplay *VaDisplay; ///< VA-API display +#endif + /// /// CUVID decoder /// typedef struct _cuvid_decoder_ { +#ifdef VAAPI + VADisplay *VaDisplay; ///< VA-API display +#endif + + xcb_window_t Window; ///< output window int VideoX; ///< video base x coordinate @@ -1661,17 +1685,18 @@ typedef struct _cuvid_decoder_ int SurfaceWrite; ///< write pointer int SurfaceRead; ///< read pointer atomic_t SurfacesFilled; ///< how many of the buffer is used - + AVFrame *frames[CODEC_SURFACES_MAX+1]; +#ifdef CUVID 
CUarray cu_array[CODEC_SURFACES_MAX+1][2]; CUgraphicsResource cu_res[CODEC_SURFACES_MAX+1][2]; GLuint gl_textures[(CODEC_SURFACES_MAX+1)*2]; // where we will copy the CUDA result CUcontext cuda_ctx; - +#endif #ifdef PLACEBO struct pl_image pl_images[CODEC_SURFACES_MAX+1]; // images for Placebo chain - const struct pl_tex *pl_tex_in[CODEC_SURFACES_MAX+1][2]; // Textures in image - const struct pl_buf *pl_buf_Y,*pl_buf_UV; // buffer for Texture upload - struct ext_buf ebuf[2]; // for managing vk buffer +// const struct pl_tex *pl_tex_in[CODEC_SURFACES_MAX+1][2]; // Textures in image + const struct pl_buf *pl_buf_Y[2],*pl_buf_UV[2]; // buffer for Texture upload + struct ext_buf ebuf[4]; // for managing vk buffer #endif @@ -1684,7 +1709,7 @@ typedef struct _cuvid_decoder_ int SyncOnAudio; ///< flag sync to audio int64_t PTS; ///< video PTS clock -#ifdef YADIF +#if defined(YADIF) || defined (VAAPI) AVFilterContext *buffersink_ctx; AVFilterContext *buffersrc_ctx; AVFilterGraph *filter_graph; @@ -1720,6 +1745,7 @@ typedef struct priv { struct pl_tex final_fbo; VkSurfaceKHR pSurface; VkSemaphore sig_in; + int has_dma_buf; }priv; static priv *p; static struct pl_overlay osdoverlay; @@ -1810,7 +1836,7 @@ int CuvidMessage(int level, const char *format, ...) 
//////////////////////////////////////////////////////////////////////////////// // These are CUDA Helper functions - +#ifdef CUVID // This will output the proper CUDA error strings in the event that a CUDA host call returns an error #define checkCudaErrors(err) __checkCudaErrors (err, __FILE__, __LINE__) @@ -1823,6 +1849,7 @@ static inline void __checkCudaErrors(CUresult err, const char *file, const int l exit(EXIT_FAILURE); } } +#endif // Surfaces ------------------------------------------------------------- void @@ -1875,22 +1902,32 @@ static void CuvidDestroySurfaces(CuvidDecoder * decoder) #endif for (i=0;iSurfacesNeeded;i++) { + if (decoder->frames[i]) { + av_frame_free(&decoder->frames[i]); + } for (j=0;j<2;j++) { -#ifdef PLACEBO - pl_tex_destroy(p->gpu,&decoder->pl_tex_in[i][j]); +#ifdef PLACEBO + if (decoder->pl_images[i].planes[j].texture) { + +#ifdef VAAPI + if (p->has_dma_buf && decoder->pl_images[i].planes[j].texture->params.shared_mem.handle.fd) { + close(decoder->pl_images[i].planes[j].texture->params.shared_mem.handle.fd); + } +#endif + pl_tex_destroy(p->gpu,&decoder->pl_images[i].planes[j].texture); + } #else checkCudaErrors(cuGraphicsUnregisterResource(decoder->cu_res[i][j])); #endif } } #ifdef PLACEBO -// Never ever close the FD this will corrupt cuda -// if (decoder->pl_buf_Y->handles.fd > 0) -// close(decoder->pl_buf_Y->handles.fd); -// if (decoder->pl_buf_UV->handles.fd > 0) -// close(decoder->pl_buf_UV->handles.fd); - pl_buf_destroy(p->gpu,&decoder->pl_buf_Y); - pl_buf_destroy(p->gpu,&decoder->pl_buf_UV); +#ifdef CUVID + pl_buf_destroy(p->gpu,&decoder->pl_buf_Y[0]); + pl_buf_destroy(p->gpu,&decoder->pl_buf_UV[0]); + pl_buf_destroy(p->gpu,&decoder->pl_buf_Y[1]); + pl_buf_destroy(p->gpu,&decoder->pl_buf_UV[1]); +#endif pl_renderer_destroy(&p->renderer); p->renderer = pl_renderer_create(p->ctx, p->gpu); #else @@ -1957,7 +1994,25 @@ static int CuvidGetVideoSurface0(CuvidDecoder * decoder) static void CuvidReleaseSurface(CuvidDecoder * decoder, 
int surface) { int i; - + if (decoder->frames[surface]) { + av_frame_free(&decoder->frames[surface]); + } +#ifdef PLACEBO + if (p->has_dma_buf) { + if (decoder->pl_images[surface].planes[0].texture) { + if (decoder->pl_images[surface].planes[0].texture->params.shared_mem.handle.fd) { + close(decoder->pl_images[surface].planes[0].texture->params.shared_mem.handle.fd); + } + pl_tex_destroy(p->gpu,&decoder->pl_images[surface].planes[0].texture); + } + if (decoder->pl_images[surface].planes[1].texture) { + if (decoder->pl_images[surface].planes[1].texture->params.shared_mem.handle.fd) { + close(decoder->pl_images[surface].planes[1].texture->params.shared_mem.handle.fd); + } + pl_tex_destroy(p->gpu,&decoder->pl_images[surface].planes[1].texture); + } + } +#endif for (i = 0; i < decoder->SurfaceUsedN; ++i) { if (decoder->SurfacesUsed[i] == surface) { // no problem, with last used @@ -2002,7 +2057,9 @@ int CuvidTestSurfaces() { /// static CuvidDecoder *CuvidNewHwDecoder(VideoStream * stream) { + CuvidDecoder *decoder; + int i=0; // setenv ("DISPLAY", ":0", 0); @@ -2012,17 +2069,26 @@ static CuvidDecoder *CuvidNewHwDecoder(VideoStream * stream) Error(_("video/cuvid: out of decoders\n")); return NULL; } - +#ifdef CUVID if ((i = av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_CUDA, X11DisplayName, NULL, 0)) != 0) { Fatal("codec: can't allocate HW video codec context err %04x",i); } +#endif +#ifdef VAAPI + if ((i = av_hwdevice_ctx_create(&hw_device_ctx, AV_HWDEVICE_TYPE_VAAPI, "/dev/dri/renderD128" , NULL, 0)) != 0) { + Fatal("codec: can't allocate HW video codec context err %04x",i); + } +#endif HwDeviceContext = av_buffer_ref(hw_device_ctx); if (!(decoder = calloc(1, sizeof(*decoder)))) { Error(_("video/cuvid: out of memory\n")); return NULL; } - +#ifdef VAAPI + VaDisplay = TO_VAAPI_DEVICE_CTX(HwDeviceContext)->display; + decoder->VaDisplay = VaDisplay; +#endif decoder->Window = VideoWindow; //decoder->VideoX = 0; // done by calloc //decoder->VideoY = 0; @@ -2047,6 
+2113,7 @@ static CuvidDecoder *CuvidNewHwDecoder(VideoStream * stream) decoder->OutputHeight = VideoWindowHeight; decoder->PixFmt = AV_PIX_FMT_NONE; + #ifdef USE_AUTOCROP //decoder->AutoCropBuffer = NULL; // done by calloc //decoder->AutoCropBufferSize = 0; @@ -2219,6 +2286,10 @@ createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned i Debug(3,"video/vulkan: create %d Textures Format %s w %d h %d \n",anz,PixFmt==AV_PIX_FMT_NV12?"NV12":"P010",size_x,size_y); for (i=0;iframes[i]) { + av_frame_free(&decoder->frames[i]); + decoder->frames[i] = NULL; + } for (n=0;n<2;n++ ) { // number of planes bool ok = true; if (PixFmt == AV_PIX_FMT_NV12) { @@ -2228,7 +2299,17 @@ createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned i fmt = pl_find_named_fmt(p->gpu, n==0?"r16":"rg16"); // 10 Bit YUV size = 2; } - decoder->pl_tex_in[i][n] = pl_tex_create(p->gpu, &(struct pl_tex_params) { + if (decoder->pl_images[i].planes[n].texture) { +#ifdef VAAPI + if (p->has_dma_buf && decoder->pl_images[i].planes[n].texture->params.shared_mem.handle.fd) { + close(decoder->pl_images[i].planes[n].texture->params.shared_mem.handle.fd); + } +#endif + pl_tex_destroy(p->gpu,&decoder->pl_images[i].planes[n].texture); // delete old texture + } +// decoder->pl_tex_in[i][n] = pl_tex_create(p->gpu, &(struct pl_tex_params) { + if (p->has_dma_buf == 0) { + decoder->pl_images[i].planes[n].texture = pl_tex_create(p->gpu, &(struct pl_tex_params) { .w = n==0?size_x:size_x/2, .h = n==0?size_y:size_y/2, .d = 0, @@ -2238,9 +2319,10 @@ createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned i .sample_mode = PL_TEX_SAMPLE_LINEAR, .address_mode = PL_TEX_ADDRESS_CLAMP, }); + } // make planes for image pl = &decoder->pl_images[i].planes[n]; - pl->texture = decoder->pl_tex_in[i][n]; +// pl->texture = decoder->pl_tex_in[i][n]; pl->components = n==0?1:2; pl->shift_x = 0.0f; pl->shift_y = 0.0f; @@ -2275,8 +2357,8 @@ createTextureDst(CuvidDecoder * 
decoder,int anz, unsigned int size_x, unsigned i img->height = size_y; img->num_overlays = 0; } - - decoder->pl_buf_Y = pl_buf_create(p->gpu, &(struct pl_buf_params) { // buffer for Y texture upload +#ifdef CUVID + decoder->pl_buf_Y[0] = pl_buf_create(p->gpu, &(struct pl_buf_params) { // buffer for Y texture upload .type = PL_BUF_TEX_TRANSFER, .size = size_x * size_y * size, .host_mapped = false, @@ -2284,9 +2366,9 @@ createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned i .memory_type = PL_BUF_MEM_DEVICE, .handle_type = PL_HANDLE_FD, }); - decoder->ebuf[0].fd = dup(decoder->pl_buf_Y->shared_mem.handle.fd); // dup fd + decoder->ebuf[0].fd = dup(decoder->pl_buf_Y[0]->shared_mem.handle.fd); // dup fd - decoder->pl_buf_UV = pl_buf_create(p->gpu, &(struct pl_buf_params) { // buffer for UV texture upload + decoder->pl_buf_UV[0] = pl_buf_create(p->gpu, &(struct pl_buf_params) { // buffer for UV texture upload .type = PL_BUF_TEX_TRANSFER, .size = size_x * size_y * size / 2, .host_mapped = false, @@ -2294,18 +2376,18 @@ createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned i .memory_type = PL_BUF_MEM_DEVICE, .handle_type = PL_HANDLE_FD, }); - decoder->ebuf[1].fd = dup(decoder->pl_buf_UV->shared_mem.handle.fd); // dup fd + decoder->ebuf[1].fd = dup(decoder->pl_buf_UV[0]->shared_mem.handle.fd); // dup fd CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc = { .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, .handle.fd = decoder->ebuf[0].fd, - .size = decoder->pl_buf_Y->shared_mem.size, // image_width * image_height * bytes, + .size = decoder->pl_buf_Y[0]->shared_mem.size, // image_width * image_height * bytes, .flags = 0, }; checkCudaErrors(cuImportExternalMemory(&decoder->ebuf[0].mem, &ext_desc)); // Import Memory segment CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc = { - .offset = decoder->pl_buf_Y->shared_mem.offset, + .offset = decoder->pl_buf_Y[0]->shared_mem.offset, .size = size_x * size_y * size, .flags = 0, }; @@ -2314,37 
+2396,181 @@ createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned i CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc1 = { .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, .handle.fd = decoder->ebuf[1].fd, - .size = decoder->pl_buf_UV->shared_mem.size, // image_width * image_height * bytes / 2, + .size = decoder->pl_buf_UV[0]->shared_mem.size, // image_width * image_height * bytes / 2, .flags = 0, }; checkCudaErrors(cuImportExternalMemory(&decoder->ebuf[1].mem, &ext_desc1)); // Import Memory Segment CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc1 = { - .offset = decoder->pl_buf_UV->shared_mem.offset, + .offset = decoder->pl_buf_UV[0]->shared_mem.offset, .size = size_x * size_y * size / 2, .flags = 0, }; checkCudaErrors(cuExternalMemoryGetMappedBuffer(&decoder->ebuf[1].buf, decoder->ebuf[1].mem, &buf_desc1)); // get pointer +// ---------------------------- + decoder->pl_buf_Y[1] = pl_buf_create(p->gpu, &(struct pl_buf_params) { // buffer for Y texture upload + .type = PL_BUF_TEX_TRANSFER, + .size = size_x * size_y * size, + .host_mapped = false, + .host_writable = false, + .memory_type = PL_BUF_MEM_DEVICE, + .handle_type = PL_HANDLE_FD, + }); + decoder->ebuf[2].fd = dup(decoder->pl_buf_Y[1]->shared_mem.handle.fd); // dup fd + + decoder->pl_buf_UV[1] = pl_buf_create(p->gpu, &(struct pl_buf_params) { // buffer for UV texture upload + .type = PL_BUF_TEX_TRANSFER, + .size = size_x * size_y * size / 2, + .host_mapped = false, + .host_writable = false, + .memory_type = PL_BUF_MEM_DEVICE, + .handle_type = PL_HANDLE_FD, + }); + decoder->ebuf[3].fd = dup(decoder->pl_buf_UV[1]->shared_mem.handle.fd); // dup fd + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc2 = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + .handle.fd = decoder->ebuf[2].fd, + .size = decoder->pl_buf_Y[1]->shared_mem.size, // image_width * image_height * bytes, + .flags = 0, + }; + checkCudaErrors(cuImportExternalMemory(&decoder->ebuf[2].mem, &ext_desc2)); // Import Memory segment + + 
CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc2 = { + .offset = decoder->pl_buf_Y[1]->shared_mem.offset, + .size = size_x * size_y * size, + .flags = 0, + }; + checkCudaErrors(cuExternalMemoryGetMappedBuffer(&decoder->ebuf[2].buf, decoder->ebuf[2].mem, &buf_desc2)); // get Pointer + + CUDA_EXTERNAL_MEMORY_HANDLE_DESC ext_desc3 = { + .type = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, + .handle.fd = decoder->ebuf[3].fd, + .size = decoder->pl_buf_UV[1]->shared_mem.size, // image_width * image_height * bytes / 2, + .flags = 0, + }; + checkCudaErrors(cuImportExternalMemory(&decoder->ebuf[3].mem, &ext_desc3)); // Import Memory Segment + + CUDA_EXTERNAL_MEMORY_BUFFER_DESC buf_desc3 = { + .offset = decoder->pl_buf_UV[1]->shared_mem.offset, + .size = size_x * size_y * size / 2, + .flags = 0, + }; + checkCudaErrors(cuExternalMemoryGetMappedBuffer(&decoder->ebuf[3].buf, decoder->ebuf[3].mem, &buf_desc3)); // get pointer +#endif +} +#ifdef VAAPI +// copy image and process using CUDA +void generateVAAPIImage(CuvidDecoder * decoder,int index, const AVFrame *frame,int image_width , int image_height) +{ + int n; + VAStatus status; + static int toggle = 0; + uint64_t first_time; + VADRMPRIMESurfaceDescriptor desc; + + status = vaExportSurfaceHandle(decoder->VaDisplay, (unsigned int)frame->data[3], + VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME_2, + VA_EXPORT_SURFACE_READ_ONLY | + VA_EXPORT_SURFACE_SEPARATE_LAYERS, + &desc); + + if (status != VA_STATUS_SUCCESS) { + printf("Fehler beim export VAAPI Handle\n"); + return; + } + vaSyncSurface(decoder->VaDisplay,(unsigned int)frame->data[3]); + + for (n = 0; n < 2; n++) { // Set DMA_BUF from VAAPI decoder to Textures + int id = desc.layers[n].object_index[0]; + int fd = desc.objects[id].fd; + uint32_t size = desc.objects[id].size; + uint32_t offset = desc.layers[n].offset[0]; + const struct pl_fmt *fmt; + + if (fd == -1) { + printf("Fehler beim Import von Surface %d\n",index); + return; + } + + if (decoder->PixFmt == AV_PIX_FMT_NV12) { + fmt = 
pl_find_named_fmt(p->gpu, n==0?"r8":"rg8"); // 8 Bit YUV + } else { + fmt = pl_find_named_fmt(p->gpu, n==0?"r16":"rg16"); // 10 Bit YUV + } + + struct pl_tex_params tex_params = { + .w = n==0?image_width:image_width/2, + .h = n==0?image_height:image_height/2, + .d = 0, + .format = fmt, + .sampleable = true, + .host_writable = false, + .address_mode = PL_TEX_ADDRESS_CLAMP, + .sample_mode = PL_TEX_SAMPLE_LINEAR, + .import_handle = PL_HANDLE_DMA_BUF, + .shared_mem = (struct pl_shared_mem) { + .handle = { + .fd = fd, + }, + .size = size, + .offset = offset, + }, + }; + +//printf("vor create Object %d with fd %d import size %u offset %d %dx%d\n",id,fd,size,offset, tex_params.w,tex_params.h); + + if (decoder->pl_images[index].planes[n].texture) { +#if 0 + if (decoder->pl_images[index].planes[n].texture->params.shared_mem.handle.fd) { + close(decoder->pl_images[index].planes[n].texture->params.shared_mem.handle.fd); + printf("close FD %d\n",decoder->pl_images[index].planes[n].texture->params.shared_mem.handle.fd); + } +#endif + pl_tex_destroy(p->gpu,&decoder->pl_images[index].planes[n].texture); + + } + decoder->pl_images[index].planes[n].texture = pl_tex_create(p->gpu, &tex_params); + + + + + } + +// VideoThreadUnlock(); } - - +#endif +#ifdef CUVID // copy image and process using CUDA void generateCUDAImage(CuvidDecoder * decoder,int index, const AVFrame *frame,int image_width , int image_height, int bytes) { int n; - - struct ext_buf ebuf[2]; + static int toggle = 0; + uint64_t first_time; +// struct ext_buf ebuf[2]; +//first_time = GetusTicks(); + VideoThreadLock(); + //printf("Upload buf to texture for frame %d in size %d-%d\n",index,image_width,image_height); - if (decoder->pl_buf_Y) - while (pl_buf_poll(p->gpu,decoder->pl_buf_Y, 5000000)); // 5 ms + if (decoder->pl_buf_Y[toggle]) + while (pl_buf_poll(p->gpu,decoder->pl_buf_Y[toggle], 000000)) { // 5 ms + VideoThreadUnlock(); + usleep(1); + VideoThreadLock(); + } else return; - if (decoder->pl_buf_UV) - while 
(pl_buf_poll(p->gpu,decoder->pl_buf_UV, 5000000)); + if (decoder->pl_buf_UV[toggle]) + while (pl_buf_poll(p->gpu,decoder->pl_buf_UV[toggle], 000000)) { + VideoThreadUnlock(); + usleep(1); + VideoThreadLock(); + } else return; - +// printf("1 got Image buffers %2.2f\n",(float)(GetusTicks()-first_time)/1000000.0); + for (n = 0; n < 2; n++) { // Copy 2 Planes from Cuda decoder to upload Buffer // widthInBytes must account for the chroma plane // elements being two samples wide. @@ -2356,28 +2582,31 @@ void generateCUDAImage(CuvidDecoder * decoder,int index, const AVFrame *frame,in .WidthInBytes = image_width * bytes, .Height = n==0?image_height:image_height/2 , .dstMemoryType = CU_MEMORYTYPE_DEVICE, - .dstDevice = decoder->ebuf[n].buf, + .dstDevice = decoder->ebuf[toggle*2+n].buf, .dstPitch = image_width * bytes, }; checkCudaErrors(cuMemcpy2D(&cpy)); } pl_tex_upload(p->gpu,&(struct pl_tex_transfer_params) { // upload Y - .tex = decoder->pl_tex_in[index][0], - .buf = decoder->pl_buf_Y, + .tex = decoder->pl_images[index].planes[0].texture, + .buf = decoder->pl_buf_Y[toggle], }); pl_tex_upload(p->gpu,&(struct pl_tex_transfer_params) { // upload UV - .tex = decoder->pl_tex_in[index][1], - .buf = decoder->pl_buf_UV, + .tex = decoder->pl_images[index].planes[1].texture, + .buf = decoder->pl_buf_UV[toggle], }); - pl_buf_export(p->gpu,decoder->pl_buf_Y); - pl_buf_export(p->gpu,decoder->pl_buf_UV); + pl_buf_export(p->gpu,decoder->pl_buf_Y[toggle]); + pl_buf_export(p->gpu,decoder->pl_buf_UV[toggle]); +// toggle = toggle==0?1:0; +// pl_gpu_flush(p->gpu); + VideoThreadUnlock(); -// pl_gpu_finish(p->gpu); } - +#endif #else +#ifdef CUVID void createTextureDst(CuvidDecoder * decoder,int anz, unsigned int size_x, unsigned int size_y, enum AVPixelFormat PixFmt) { @@ -2446,6 +2675,7 @@ void generateCUDAImage(CuvidDecoder * decoder,int index, const AVFrame *frame,in } } #endif +#endif @@ -2484,16 +2714,15 @@ static unsigned CuvidGetVideoSurface(CuvidDecoder * decoder, return 
CuvidGetVideoSurface0(decoder); } -#ifdef YADIF + +#ifdef VAAPI static void CuvidSyncRenderFrame(CuvidDecoder * decoder, const AVCodecContext * video_ctx, const AVFrame * frame); int push_filters(AVCodecContext * dec_ctx,CuvidDecoder * decoder,AVFrame *frame) { - int ret; + int ret,i=0; AVFrame *filt_frame = av_frame_alloc(); -// frame->pts = frame->best_effort_timestamp; - /* push the decoded frame into the filtergraph */ if (av_buffersrc_add_frame_flags(decoder->buffersrc_ctx, frame, AV_BUFFERSRC_FLAG_KEEP_REF) < 0) { av_log(NULL, AV_LOG_ERROR, "Error while feeding the filtergraph\n"); @@ -2503,14 +2732,171 @@ int push_filters(AVCodecContext * dec_ctx,CuvidDecoder * decoder,AVFrame *frame) /* pull filtered frames from the filtergraph */ while ((ret = av_buffersink_get_frame(decoder->buffersink_ctx, filt_frame)) >= 0) { // filt_frame->pts = frame->pts; // Restore orginal pts -// frame->pts += 20 * 90; // prepare for next frame +// filt_frame->pts += 20 * 90; // prepare for next frame + filt_frame->pts /= 2; + decoder->Interlaced = 0; +// printf("vaapideint video:new %#012" PRIx64 " old %#012" PRIx64 "\n",filt_frame->pts,frame->pts); + CuvidSyncRenderFrame(decoder, dec_ctx, filt_frame); + if (i++ == 0) + filt_frame = av_frame_alloc(); // get new frame +// av_frame_unref(filt_frame); + } + +// av_frame_free(&filt_frame); + av_frame_free(&frame); + return ret; +} + + +int init_filters(AVCodecContext * dec_ctx,CuvidDecoder * decoder,AVFrame *frame) +{ +#ifdef VAAPI + const char *filters_descr = "deinterlace_vaapi=rate=field:auto=1"; // + enum AVPixelFormat format = AV_PIX_FMT_VAAPI; +#endif +#ifdef YADIF + const char *filters_descr = "yadif_cuda=1:0:1"; // mode=send_field,parity=tff,deint=interlaced"; + enum AVPixelFormat format = AV_PIX_FMT_CUDA; +#endif + char args[512]; + int ret = 0; + const AVFilter *buffersrc = avfilter_get_by_name("buffer"); + const AVFilter *buffersink = avfilter_get_by_name("buffersink"); + AVFilterInOut *outputs = avfilter_inout_alloc(); + 
AVFilterInOut *inputs = avfilter_inout_alloc(); + AVBufferSrcParameters *src_params; + + enum AVPixelFormat pix_fmts[] = { format, AV_PIX_FMT_NONE }; + + if (decoder->filter_graph) + avfilter_graph_free(&decoder->filter_graph); + + decoder->filter_graph = avfilter_graph_alloc(); + if (!outputs || !inputs || !decoder->filter_graph) { + ret = AVERROR(ENOMEM); + goto end; + } + + /* buffer video source: the decoded frames from the decoder will be inserted here. */ + snprintf(args, sizeof(args), + "video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d", + dec_ctx->width, dec_ctx->height, format, + 1, 90000, + dec_ctx->sample_aspect_ratio.num, dec_ctx->sample_aspect_ratio.den); + + ret = avfilter_graph_create_filter(&decoder->buffersrc_ctx, buffersrc, "in", + args, NULL, decoder->filter_graph); + if (ret < 0) { + Debug(3, "Cannot create buffer source\n"); + goto end; + } + src_params = av_buffersrc_parameters_alloc(); + src_params->hw_frames_ctx = frame->hw_frames_ctx; + src_params->format = format; + src_params->time_base.num = 1; + src_params->time_base.den = 90000; + src_params->width = dec_ctx->width; + src_params->height = dec_ctx->height; + src_params->frame_rate.num = 50; + src_params->frame_rate.den = 1; + src_params->sample_aspect_ratio = dec_ctx->sample_aspect_ratio; + +//printf("width %d height %d hw_frames_ctx %p\n",dec_ctx->width,dec_ctx->height ,frame->hw_frames_ctx); + ret = av_buffersrc_parameters_set(decoder->buffersrc_ctx, src_params); + if (ret < 0) { + Debug(3, "Cannot set hw_frames_ctx to src\n"); + goto end; + } + /* buffer video sink: to terminate the filter chain. 
*/ + ret = avfilter_graph_create_filter(&decoder->buffersink_ctx, buffersink, "out", + NULL, NULL, decoder->filter_graph); + if (ret < 0) { + Debug(3, "Cannot create buffer sink\n"); + goto end; + } +#if 0 + ret = av_opt_set_int_list(decoder->buffersink_ctx, "pix_fmts", pix_fmts, + AV_PIX_FMT_NONE , AV_OPT_SEARCH_CHILDREN); + if (ret < 0) { + Debug(3, "Cannot set output pixel format\n"); + goto end; + } +#endif + /* + * Set the endpoints for the filter graph. The filter_graph will + * be linked to the graph described by filters_descr. + */ + + /* + * The buffer source output must be connected to the input pad of + * the first filter described by filters_descr; since the first + * filter input label is not specified, it is set to "in" by + * default. + */ + outputs->name = av_strdup("in"); + outputs->filter_ctx = decoder->buffersrc_ctx; + outputs->pad_idx = 0; + outputs->next = NULL; + + /* + * The buffer sink input must be connected to the output pad of + * the last filter described by filters_descr; since the last + * filter output label is not specified, it is set to "out" by + * default. 
+ */ + inputs->name = av_strdup("out"); + inputs->filter_ctx = decoder->buffersink_ctx; + inputs->pad_idx = 0; + inputs->next = NULL; + + if ((ret = avfilter_graph_parse_ptr(decoder->filter_graph, filters_descr, &inputs, &outputs, NULL)) < 0) { + Debug(3,"Cannot set graph parse %d\n",ret); + goto end; + } + + if ((ret = avfilter_graph_config(decoder->filter_graph, NULL)) < 0) { + Debug(3,"Cannot set graph config %d\n",ret); + goto end; + } + +end: + avfilter_inout_free(&inputs); + avfilter_inout_free(&outputs); + + return ret; +} + +#endif + + +#ifdef YADIF +static void CuvidSyncRenderFrame(CuvidDecoder * decoder, + const AVCodecContext * video_ctx, const AVFrame * frame); + +int push_filters(AVCodecContext * dec_ctx,CuvidDecoder * decoder,AVFrame *frame) { + int ret,i; + AVFrame *filt_frame = av_frame_alloc(); + +// frame->pts = frame->best_effort_timestamp; + + /* push the decoded frame into the filtergraph */ + if (av_buffersrc_add_frame_flags(decoder->buffersrc_ctx, frame, AV_BUFFERSRC_FLAG_KEEP_REF) < 0) { + av_log(NULL, AV_LOG_ERROR, "Error while feeding the filtergraph\n"); + } + av_frame_copy_props(filt_frame,frame); +//printf("Interlaced %d tff %d\n",frame->interlaced_frame,frame->top_field_first); + /* pull filtered frames from the filtergraph */ + while ((ret = av_buffersink_get_frame(decoder->buffersink_ctx, filt_frame)) >= 0) { filt_frame->pts /= 2; // Debug(3,"video:new %#012" PRIx64 " old %#012" PRIx64 "\n",filt_frame->pts,frame->pts); CuvidSyncRenderFrame(decoder, dec_ctx, filt_frame); - av_frame_unref(filt_frame); + if (i++ == 0) + filt_frame = av_frame_alloc(); // get new frame +// av_frame_unref(filt_frame); } - av_frame_free(&filt_frame); +// av_frame_free(&filt_frame); + av_frame_free(&frame); return ret; } @@ -2656,13 +3042,20 @@ static enum AVPixelFormat Cuvid_get_format(CuvidDecoder * decoder, if (*fmt_idx == AV_PIX_FMT_P010LE) bitformat16 = 1; } +#ifdef VAAPI + if (video_ctx->profile == FF_PROFILE_HEVC_MAIN_10) + bitformat16 = 1; +#endif 
Debug(3, "%s: codec %d fmts:\n", __FUNCTION__, video_ctx->codec_id); for (fmt_idx = fmt; *fmt_idx != AV_PIX_FMT_NONE; fmt_idx++) { Debug(3, "\t%#010x %s\n", *fmt_idx, av_get_pix_fmt_name(*fmt_idx)); // check supported pixel format with entry point switch (*fmt_idx) { +#ifdef CUVID case AV_PIX_FMT_CUDA: +#endif + case AV_PIX_FMT_VAAPI: break; default: continue; @@ -2672,20 +3065,32 @@ static enum AVPixelFormat Cuvid_get_format(CuvidDecoder * decoder, Debug(3,"video profile %d codec id %d\n",video_ctx->profile,video_ctx->codec_id); if (*fmt_idx == AV_PIX_FMT_NONE) { - Error(_("video: no valid pixfmt found\n")); + Fatal(_("video: no valid pixfmt found\n")); } - +#ifdef CUVID if (*fmt_idx != AV_PIX_FMT_CUDA) { Fatal(_("video: no valid profile found\n")); } if (ist->GetFormatDone) return AV_PIX_FMT_CUDA; +#endif +#ifdef VAAPI + if (*fmt_idx != AV_PIX_FMT_VAAPI) { + Fatal(_("video: no valid profile found\n")); + } + if (ist->GetFormatDone) + return AV_PIX_FMT_VAAPI; +#endif ist->GetFormatDone = 1; Debug(3, "video: create decoder 16bit?=%d %dx%d old %d %d\n",bitformat16, video_ctx->width, video_ctx->height,decoder->InputWidth,decoder->InputHeight); - +#ifdef CUVID if (*fmt_idx == AV_PIX_FMT_CUDA ) { // HWACCEL used +#endif +#ifdef VAAPI + if (*fmt_idx == AV_PIX_FMT_VAAPI) { // HWACCEL used +#endif // Check image, format, size // if (bitformat16) { @@ -2695,10 +3100,10 @@ static enum AVPixelFormat Cuvid_get_format(CuvidDecoder * decoder, decoder->PixFmt = AV_PIX_FMT_NV12; // 8 Bit Planar ist->hwaccel_output_format = AV_PIX_FMT_NV12; } -#if 1 + if (1 || video_ctx->width != decoder->InputWidth || video_ctx->height != decoder->InputHeight) { - + VideoThreadLock(); CuvidCleanup(decoder); decoder->InputAspect = video_ctx->sample_aspect_ratio; decoder->InputWidth = video_ctx->width; @@ -2707,6 +3112,7 @@ static enum AVPixelFormat Cuvid_get_format(CuvidDecoder * decoder, decoder->SurfacesNeeded = VIDEO_SURFACES_MAX + 1; CuvidSetupOutput(decoder); + VideoThreadUnlock(); #ifdef 
PLACEBO // dont show first frame decoder->newchannel = 1; #endif @@ -2726,14 +3132,20 @@ static enum AVPixelFormat Cuvid_get_format(CuvidDecoder * decoder, #endif } -#endif CuvidMessage(2,"CUVID Init ok %dx%d\n",video_ctx->width,video_ctx->height); + decoder->InputAspect = video_ctx->sample_aspect_ratio; +#ifdef CUVID ist->active_hwaccel_id = HWACCEL_CUVID; ist->hwaccel_pix_fmt = AV_PIX_FMT_CUDA; + return AV_PIX_FMT_CUDA; +#endif +#ifdef VAAPI + ist->filter = 1; // init deint vaapi + ist->active_hwaccel_id = HWACCEL_VAAPI; + ist->hwaccel_pix_fmt = AV_PIX_FMT_VAAPI; + return AV_PIX_FMT_VAAPI; +#endif - decoder->InputAspect = video_ctx->sample_aspect_ratio; - - return AV_PIX_FMT_CUDA; } Fatal(_("NO Format valid")); return *fmt_idx; @@ -2944,7 +3356,7 @@ static uint8_t *CuvidGrabOutputSurfaceLocked(int *ret_size, int *ret_width, int height = decoder->InputHeight; #endif - Debug(3, "video/cuvid: grab %dx%d\n", width, height); +// Debug(3, "video/cuvid: grab %dx%d\n", width, height); source_rect.x0 = 0; source_rect.y0 = 0; @@ -2977,9 +3389,7 @@ static uint8_t *CuvidGrabOutputSurfaceLocked(int *ret_size, int *ret_width, int } } - Debug(3, "video/cuvid: grab source rect %d,%d:%d,%d dest dim %dx%d\n", - source_rect.x0, source_rect.y0, source_rect.x1, source_rect.y1, - width, height); +// printf("video/cuvid: grab source dim %dx%d\n", width, height); size = width * height * sizeof(uint32_t); @@ -3000,7 +3410,7 @@ static uint8_t *CuvidGrabOutputSurfaceLocked(int *ret_size, int *ret_width, int while(decoder->grab) { usleep(1000); // wait for data } - Debug(3,"got grab data\n"); +// Debug(3,"got grab data\n"); if (ret_size) { *ret_size = size; @@ -3285,11 +3695,13 @@ static void CuvidQueueVideoSurface(CuvidDecoder * decoder, int surface, int soft atomic_inc(&decoder->SurfacesFilled); } +#if 0 extern void Nv12ToBgra32(uint8_t *dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix,cudaStream_t stream); extern void P016ToBgra32(uint8_t 
*dpNv12, int nNv12Pitch, uint8_t *dpBgra, int nBgraPitch, int nWidth, int nHeight, int iMatrix,cudaStream_t stream); extern void ResizeNv12(unsigned char *dpDstNv12, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcNv12, int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char* dpDstNv12UV); extern void ResizeP016(unsigned char *dpDstP016, int nDstPitch, int nDstWidth, int nDstHeight, unsigned char *dpSrcP016, int nSrcPitch, int nSrcWidth, int nSrcHeight, unsigned char* dpDstP016UV); extern void cudaLaunchNV12toARGBDrv(uint32_t *d_srcNV12, size_t nSourcePitch,uint32_t *d_dstARGB, size_t nDestPitch,uint32_t width, uint32_t height,CUstream streamID); +#endif void VideoSetAbove(); /// /// Render a ffmpeg frame. @@ -3299,11 +3711,19 @@ void VideoSetAbove(); /// @param frame frame to display /// static void CuvidRenderFrame(CuvidDecoder * decoder, - const AVCodecContext * video_ctx, const AVFrame * frame) + const AVCodecContext * video_ctx, AVFrame * frame) { + uint64_t first_time; int surface; enum AVColorSpace color; +#if 1 + if (skipwait > 1) { + skipwait--; + return; + } +#endif + // update aspect ratio changes #if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(53,60,100) if (decoder->InputWidth && decoder->InputHeight @@ -3351,7 +3771,12 @@ static void CuvidRenderFrame(CuvidDecoder * decoder, // Copy data from frame to image // +#ifdef CUVID if (video_ctx->pix_fmt == AV_PIX_FMT_CUDA) { +#endif +#ifdef VAAPI + if (video_ctx->pix_fmt == AV_PIX_FMT_VAAPI) { +#endif int w = decoder->InputWidth; int h = decoder->InputHeight; @@ -3361,45 +3786,68 @@ static void CuvidRenderFrame(CuvidDecoder * decoder, surface = CuvidGetVideoSurface0(decoder); - if (surface == -1) // no free surfaces + if (surface == -1) { // no free surfaces + Debug(3,"no more surfaces\n"); return; - -#if 0 // old copy via host ram + } + + if (!decoder->Closing) { + VideoSetPts(&decoder->PTS, decoder->Interlaced, video_ctx, frame); + } + +#ifdef VAAPI // old copy via host ram { - AVFrame 
*output; - int t = decoder->PixFmt==AV_PIX_FMT_NV12?1:2; - struct pl_rect3d rc1 = {0,0,0,w,h,0}; + AVFrame *output; + VideoThreadLock(); + int t = decoder->PixFmt==AV_PIX_FMT_NV12?1:2; + struct pl_rect3d rc1 = {0,0,0,w,h,0}; + + if (p->has_dma_buf) { // Vulkan supports DMA_BUF no copy required + generateVAAPIImage(decoder,surface,frame,w,h); + } + else { // we need to Copy the frame via RAM + vaSyncSurface(decoder->VaDisplay,(unsigned int)frame->data[3]); output = av_frame_alloc(); + // av_frame_ref(output,frame); av_hwframe_transfer_data(output,frame,0); - av_frame_copy_props(output,frame); + av_frame_copy_props(output,frame); +// printf("Save Surface ID %d %p %p\n",surface,decoder->pl_images[surface].planes[0].texture,decoder->pl_images[surface].planes[1].texture); bool ok = pl_tex_upload(p->gpu,&(struct pl_tex_transfer_params) { - .tex = decoder->pl_tex_in[surface][0], - .stride_w = output->linesize[0] / t, + .tex = decoder->pl_images[surface].planes[0].texture, + .stride_w = output->linesize[0], + .stride_h = h, .ptr = output->data[0], .rc.x1 = w, .rc.y1 = h, .rc.z1 = 0, }); ok &= pl_tex_upload(p->gpu,&(struct pl_tex_transfer_params) { - .tex = decoder->pl_tex_in[surface][1], - .stride_w = (output->linesize[1] / 2) / t, + .tex = decoder->pl_images[surface].planes[1].texture, + .stride_w = output->linesize[0]/2, + .stride_h = h/2, .ptr = output->data[1], .rc.x1 = w/2, .rc.y1 = h/2, .rc.z1 = 0, }); - av_frame_free(&output); } + VideoThreadUnlock(); + } #endif +#ifdef CUVID +// first_time = GetusTicks(); // copy to texture generateCUDAImage(decoder,surface,frame,w,h,decoder->PixFmt==AV_PIX_FMT_NV12?1:2); - +// printf("generate CUDA Image %d\n",(GetusTicks()-first_time)/1000000); +#endif CuvidQueueVideoSurface(decoder, surface, 1); + decoder->frames[surface] = frame; return; } + Fatal(_("video/vdpau: pixel format %d not supported\n"),video_ctx->pix_fmt); } @@ -3414,6 +3862,7 @@ static void *CuvidGetHwAccelContext(CuvidDecoder * decoder) Debug(3, "Initializing cuvid 
hwaccel thread ID:%ld\n",(long int)syscall(186)); //turn NULL; +#ifdef CUVID if (decoder->cuda_ctx) { Debug(3,"schon passiert\n"); return NULL; @@ -3435,7 +3884,7 @@ static void *CuvidGetHwAccelContext(CuvidDecoder * decoder) cuCtxGetApiVersion(decoder->cuda_ctx,&version); Debug(3, "***********CUDA API Version %d\n",version); - +#endif return NULL; } @@ -3482,6 +3931,7 @@ static void CuvidAdvanceDecoderFrame(CuvidDecoder * decoder) // VideoGetBuffers(decoder->Stream)); return; } + decoder->SurfaceRead = (decoder->SurfaceRead + 1) % VIDEO_SURFACES_MAX; atomic_dec(&decoder->SurfacesFilled); decoder->SurfaceField = !decoder->Interlaced; @@ -3695,6 +4145,7 @@ static void CuvidMixVideo(CuvidDecoder * decoder, __attribute__((unused))int lev if (decoder->newchannel && current == 0 ) { colors.brightness = -1.0f; + colors.contrast = 0.0f; if (!pl_render_image(p->renderer, &decoder->pl_images[current], target, &render_params)) { Debug(3,"Failed rendering frame!\n"); } @@ -3797,7 +4248,7 @@ void make_osd_overlay(int x, int y, int width, int height) { static void CuvidDisplayFrame(void) { - uint64_t first_time; + static uint64_t first_time = 0; static uint64_t last_time = 0; int i; static unsigned int Count; @@ -3808,9 +4259,11 @@ static void CuvidDisplayFrame(void) #ifdef PLACEBO uint64_t diff; + static float fdiff = 23000.0; struct pl_swapchain_frame frame; struct pl_render_target target; bool ok; + static int first = 1; VkImage Image; const struct pl_fmt *fmt; const float black[4] = { 0.0f,0.0f,0.0f,1.0f}; @@ -3822,46 +4275,59 @@ static void CuvidDisplayFrame(void) if (CuvidDecoderN) CuvidDecoders[0]->Frameproc = (float)(GetusTicks()-last_time)/1000000.0; // printf("Time used %2.2f\n",CuvidDecoders[0]->Frameproc); -#endif - -#ifndef PLACEBO + glXWaitVideoSyncSGI (2, (Count + 1) % 2, &Count); // wait for previous frame to swap last_time = GetusTicks(); glClear(GL_COLOR_BUFFER_BIT); #else - if (CuvidDecoderN) - CuvidDecoders[0]->Frameproc = 
(float)(GetusTicks()-last_time)/1000000.0; -#if 0 - diff = (GetusTicks()-last_time)/1000000; -// printf("Time wait %2.2f \n",(float)(diff)); - last_time = GetusTicks(); - if (diff < 19) { -// printf("Sleep %d\n",19-diff); -// usleep((18 - diff) * 1000); - } -// usleep(15000); -// printf("sleept %d\n",(GetusTicks()-last_time)/1000000); - +// if (CuvidDecoderN) { +// CuvidDecoders[0]->Frameproc = (float)(GetusTicks()-last_time)/1000000.0; +// } +#if 1 + diff = (GetusTicks()-last_time)/1000; //000; + // last_time = GetusTicks(); +//printf("Roundtrip %d\n",diff); + if (diff < 15000 && skipwait != 1) { +// printf("Sleep %d\n",15000-diff); + usleep((15000 - diff));// * 1000); + } else if (skipwait != 1) { + // usleep(15000); + } + #endif if (!p->swapchain) return; -// pl_swapchain_swap_buffers(p->swapchain); // swap buffers +//last_time = GetusTicks(); + VideoThreadLock(); -// printf(" Latency %d \n",pl_swapchain_latency(p->swapchain)); -// last_time = GetusTicks(); - - - pthread_mutex_unlock(&VideoLockMutex); + if (!first) { +// last_time = GetusTicks(); + if (!pl_swapchain_submit_frame(p->swapchain)) + Fatal(_("Failed to submit swapchain buffer\n")); + pl_swapchain_swap_buffers(p->swapchain); // swap buffers +// printf("submit and swap %d\n",(GetusTicks()-last_time)/1000000); + } + + first = 0; +#if 0 + fdiff = (float)(GetusTicks()-first_time)/1000.0; + if (fdiff > 20100.0 || fdiff < 19900.0) + printf("roundtrip %2.2f\n",fdiff); + first_time = GetusTicks(); +#endif + last_time = GetusTicks(); + while (!pl_swapchain_start_frame(p->swapchain, &frame)) { // get new frame wait for previous to swap usleep(5); } - pthread_mutex_lock(&VideoLockMutex); -last_time = GetusTicks(); +//printf("wait for frame %d\n",(GetusTicks()-last_time)/1000000); + + if (!frame.fbo) return; pl_render_target_from_swapchain(&target, &frame); // make target frame @@ -3933,7 +4399,9 @@ last_time = GetusTicks(); } continue; } -#ifdef PLACEBO + +#ifdef PLACEBO + if (OsdShown == 1) { // New OSD opened 
pthread_mutex_lock(&OSDMutex); make_osd_overlay(OSDx,OSDy,OSDxsize,OSDysize); @@ -3947,12 +4415,14 @@ last_time = GetusTicks(); pthread_mutex_unlock(&OSDMutex); } + if (OsdShown == 2) { CuvidMixVideo(decoder, i, &target, &osdoverlay); } else { CuvidMixVideo(decoder, i, &target, NULL); } + #else CuvidMixVideo(decoder, i); #endif @@ -3969,6 +4439,8 @@ last_time = GetusTicks(); decoder->grab = 0; } } + + // #ifndef PLACEBO // add osd to surface @@ -3994,15 +4466,12 @@ last_time = GetusTicks(); #endif #ifdef PLACEBO - if (!pl_swapchain_submit_frame(p->swapchain)) - Fatal(_("Failed to submit swapchain buffer\n")); - pl_swapchain_swap_buffers(p->swapchain); // swap buffers + + VideoThreadUnlock(); #else glXGetVideoSyncSGI (&Count); // get current frame glXSwapBuffers(XlibDisplay, VideoWindow); #endif - - // FIXME: CLOCK_MONOTONIC_RAW clock_gettime(CLOCK_MONOTONIC, &CuvidFrameTime); @@ -4011,6 +4480,7 @@ last_time = GetusTicks(); CuvidDecoders[i]->FrameTime = CuvidFrameTime; } + } /// @@ -4135,7 +4605,7 @@ static void CuvidSyncDecoder(CuvidDecoder * decoder) goto skip_sync; } audio_clock = AudioGetClock(); - +//printf("Diff %d %ld %ld filled %d \n",(video_clock - audio_clock - VideoAudioDelay)/90,video_clock,audio_clock,filled); // 60Hz: repeat every 5th field if (Video60HzMode && !(decoder->FramesDisplayed % 6)) { if (audio_clock == (int64_t) AV_NOPTS_VALUE || video_clock == (int64_t) AV_NOPTS_VALUE) { @@ -4182,8 +4652,21 @@ static void CuvidSyncDecoder(CuvidDecoder * decoder) diff = video_clock - audio_clock - VideoAudioDelay; diff = (decoder->LastAVDiff + diff) / 2; decoder->LastAVDiff = diff; -//if (abs(diff/90) > 100) -// printf("Diff %d\n",diff/90); + decoder->Frameproc = diff/90; +#if 1 + if (skipwait <= 1) { + if ((diff/90) > 55) { + skipwait = 1; + } else if ((diff/90) < -25) { + skipwait = 3; + } else { + skipwait = 0; + } + } +#endif + skipwait =0; +// printf("Diff %d filled %d skipwait %d\n",diff/90,filled,skipwait); + if (abs(diff) > 5000 * 90) { // more than 
5s err = CuvidMessage(2, "video: audio/video difference too big\n"); decoder->SyncCounter = 1; @@ -4199,13 +4682,15 @@ static void CuvidSyncDecoder(CuvidDecoder * decoder) ++decoder->FramesDuped; decoder->SyncCounter = 1; goto out; - } else if (diff < -25 * 90 && filled > 3 + 2 * decoder->Interlaced) { + } else if (diff < -25 * 90) { err = CuvidMessage(3, "video: speed up video, droping frame\n"); ++decoder->FramesDropped; CuvidAdvanceDecoderFrame(decoder); + if (filled > 2 && (diff < -50 * 90)) + CuvidAdvanceDecoderFrame(decoder); // filled = atomic_read(&decoder->SurfacesFilled); // Debug(3,"hinter drop frame filled %d\n",atomic_read(&decoder->SurfacesFilled)); - decoder->SyncCounter = 1;;; + decoder->SyncCounter = 1; } #if defined(DEBUG) || defined(AV_INFO) if (!decoder->SyncCounter && decoder->StartCounter < 1000) { @@ -4322,9 +4807,9 @@ static void CuvidSyncRenderFrame(CuvidDecoder * decoder, return; } - if (!decoder->Closing) { - VideoSetPts(&decoder->PTS, decoder->Interlaced, video_ctx, frame); - } +// if (!decoder->Closing) { +// VideoSetPts(&decoder->PTS, decoder->Interlaced, video_ctx, frame); +// } CuvidRenderFrame(decoder, video_ctx, frame); } @@ -4383,10 +4868,6 @@ Debug(3,"Set video mode %dx%d\n",VideoWindowWidth,VideoWindowHeight); } } - - - - /// /// Handle a CUVID display. 
/// @@ -4403,7 +4884,9 @@ static void CuvidDisplayHandlerThread(void) allfull = 1; decoded = 0; +#ifndef PLACEBO pthread_mutex_lock(&VideoLockMutex); +#endif for (i = 0; i < CuvidDecoderN; ++i) { decoder = CuvidDecoders[i]; @@ -4412,7 +4895,7 @@ static void CuvidDisplayHandlerThread(void) // filled = atomic_read(&decoder->SurfacesFilled); //if (filled <= 1 + 2 * decoder->Interlaced) { - if (filled < 5) { + if (filled < 4) { // FIXME: hot polling // fetch+decode or reopen allfull = 0; @@ -4431,22 +4914,24 @@ static void CuvidDisplayHandlerThread(void) } } #ifdef PLACEBO -// usleep(1000); + usleep(1000); #endif continue; } decoded = 1; } - + +#ifndef PLACEBO pthread_mutex_unlock(&VideoLockMutex); -#if 1 +#endif + if (!decoded) { // nothing decoded, sleep // FIXME: sleep on wakeup - usleep(1 * 100); + usleep(1 * 1000); } -#endif + #ifdef PLACEBO - usleep(100); + usleep(1000); #endif // all decoder buffers are full @@ -4958,33 +5443,32 @@ static void VideoEvent(void) case KeyPress: VideoThreadLock(); letter_len = - XLookupString(&event.xkey, letter, sizeof(letter) - 1, &keysym, - NULL); + XLookupString(&event.xkey, letter, sizeof(letter) - 1, &keysym, NULL); VideoThreadUnlock(); if (letter_len < 0) { - letter_len = 0; + letter_len = 0; } letter[letter_len] = '\0'; if (keysym == NoSymbol) { - Warning(_("video/event: No symbol for %d\n"), + Warning(_("video/event: No symbol for %d\n"), event.xkey.keycode); - break; + break; } VideoThreadLock(); keynam = XKeysymToString(keysym); VideoThreadUnlock(); // check for key modifiers (Alt/Ctrl) if (event.xkey.state & (Mod1Mask | ControlMask)) { - if (event.xkey.state & Mod1Mask) { - strcpy(buf, "Alt+"); - } else { - buf[0] = '\0'; - } - if (event.xkey.state & ControlMask) { - strcat(buf, "Ctrl+"); - } - strncat(buf, keynam, sizeof(buf) - 10); - keynam = buf; + if (event.xkey.state & Mod1Mask) { + strcpy(buf, "Alt+"); + } else { + buf[0] = '\0'; + } + if (event.xkey.state & ControlMask) { + strcat(buf, "Ctrl+"); + } + strncat(buf, 
keynam, sizeof(buf) - 10); + keynam = buf; } FeedKeyPress("XKeySym", keynam, 0, 0, letter); break; @@ -5043,28 +5527,33 @@ void VideoSetVideoEventCallback(void (*videoEventCallback)(void)) #ifdef USE_VIDEO_THREAD + +static uint64_t test_time=0; /// /// Lock video thread. /// -static void VideoThreadLock(void) +void VideoThreadLock(void) { if (VideoThread) { if (pthread_mutex_lock(&VideoLockMutex)) { Error(_("video: can't lock thread\n")); } +// test_time = GetusTicks(); +// printf("Lock start...."); } } /// /// Unlock video thread. /// -static void VideoThreadUnlock(void) +void VideoThreadUnlock(void) { if (VideoThread) { if (pthread_mutex_unlock(&VideoLockMutex)) { Error(_("video: can't unlock thread\n")); } +// printf("Video Locked for %d\n",(GetusTicks()-test_time)/1000000); } } #ifdef PLACEBO @@ -5089,21 +5578,15 @@ void InitPlacebo(){ struct pl_vk_inst_params iparams = pl_vk_inst_default_params; VkXcbSurfaceCreateInfoKHR xcbinfo; - - char xcbext[] = {"VK_KHR_xcb_surface"}; char surfext[] = {"VK_KHR_surface"}; Debug(3,"Init Placebo\n"); - - - - + p = calloc(1,sizeof(struct priv)); if (!p) Fatal(_("Cant get memory for PLACEBO struct")); - - + // Create context p->context.log_cb = &pl_log_intern; p->context.log_level = PL_LOG_WARN; @@ -5142,7 +5625,7 @@ void InitPlacebo(){ params.instance = p->vk_inst->instance; params.async_transfer = true; params.async_compute = true; - params.queue_count = 1; + params.queue_count = 16; params.surface = p->pSurface; params.allow_software = false; @@ -5151,7 +5634,17 @@ void InitPlacebo(){ Fatal(_("Failed to create Vulkan Device")); p->gpu = p->vk->gpu; - + + if (!(p->gpu->import_caps.tex & PL_HANDLE_DMA_BUF)) { + p->has_dma_buf = 0; + Debug(3,"No support for dma_buf import in Vulkan\n"); + } + else { + p->has_dma_buf = 1; + Debug(3,"dma_buf support in Vulkan available\n"); + } + +#if 1 // Create initial swapchain p->swapchain = pl_vulkan_create_swapchain(p->vk, &(struct pl_vulkan_swapchain_params) { .surface = p->pSurface, @@ 
-5168,7 +5661,7 @@ void InitPlacebo(){ if (!p->renderer) { Fatal(_("Failed initializing libplacebo renderer\n")); } - +#endif Debug(3,"Placebo: init ok"); } @@ -5179,9 +5672,9 @@ void InitPlacebo(){ /// static void *VideoDisplayHandlerThread(void *dummy) { - +#ifdef CUVID CUcontext cuda_ctx; - +#endif #ifdef PLACEBO_ // InitPlacebo(); #endif @@ -5224,33 +5717,33 @@ static void *VideoDisplayHandlerThread(void *dummy) pl_context_destroy(&p->ctx); free(p); #endif +#ifdef CUVID cuCtxDestroy (cuda_ctx); +#endif return dummy; } #ifdef PLACEBO static void *VideoHandlerThread(void *dummy) { - CuvidDecoder *decoder; - int filled; - - decoder = CuvidDecoders[0]; + + uint64_t first_time; prctl(PR_SET_NAME,"cuvid video display",0,0,0); + for (;;) { pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); pthread_testcancel(); pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); - VideoPollEvent(); - usleep(2000); - filled = atomic_read(&decoder->SurfacesFilled); -// if (filled >= 0) { - pthread_mutex_lock(&VideoLockMutex); - CuvidSyncDisplayFrame(); - pthread_mutex_unlock(&VideoLockMutex); -// } + VideoPollEvent(); + +// first_time = GetusTicks(); + CuvidSyncDisplayFrame(); + +// printf("syncdisplayframe exec %d\n",(GetusTicks()-first_time)/1000000); } + return dummy; } #endif @@ -5678,7 +6171,7 @@ uint8_t *VideoGrab(int *size, int *width, int *height, int write_header) /// uint8_t *VideoGrabService(int *size, int *width, int *height) { - Debug(3, "video: grab service\n"); +// Debug(3, "video: grab service\n"); #ifdef USE_GRAB if (VideoUsedModule->GrabOutput) {