import React, { useEffect, useRef, useState } from 'react';
import { getAzureSpeechToken } from 'api/speechToText.js';
import { SpeechConfig, AudioConfig, SpeechRecognizer } from 'microsoft-cognitiveservices-speech-sdk';
import styled, { css } from 'styled-components';
import { v4 } from 'uuid';
import translate from 'i18n-translations/translate.jsx';
import { isMobile } from 'react-device-detect';

/**
 * Caption bubble rendered on top of its positioned ancestor.
 *
 * Props read by the styles:
 * - `isHidden`: when truthy the bubble is rendered with `display: none`, otherwise `flex`.
 * - `isMobile`: when truthy, adds a media query that shrinks the font and the
 *   maximum width on viewports up to 480px wide.
 */
const StyledSpeechToText = styled.div`
	position: absolute;
	padding: 10px;
	border-radius: 6px;
	font-size: 24px;
	color: white;
	background: black;
	opacity: 0.75;
	left: 12%;
	max-width: 1400px;
	bottom: 12%;
	z-index: 99;
	display: ${({ isHidden: hidden }) => (hidden ? 'none' : 'flex')};

	${({ isMobile: mobile }) =>
		mobile &&
		css`
			@media (max-width: 480px) {
				font-size: 18px;
				max-width: 300px;
			}
		`}
`;

/**
 * @param {object} props
 * @param {MediaStreamTrack} props.track
 * @param {import('react').Dispatch<import('react').SetStateAction<boolean>>} props.setShowClosedCaptions
 * @param {(data: {id: string; message: JSX.Element}[]) => void} props.setClosedCaptionsError
 */
const SpeechToTextOverlay = ({ track = null, setShowClosedCaptions, setClosedCaptionsError }) => {
	const [liveTranscription, setLiveTranscription] = useState('');
	/** @type {import('react').MutableRefObject<SpeechRecognizer>} */
	const speechRecognizer = useRef(null);
	const streamRef = useRef(null);
	const speechConfig = useRef(null);
	const liveTranscriptTimeoutRef = useRef(null);

	const maxWords = isMobile ? 12 : 40;

	useEffect(() => {
		const fetchToken = async () => {
			const response = await getAzureSpeechToken();
			const tokenResponse = await getTokenOrRefresh(response);
			speechConfig.current = SpeechConfig.fromAuthorizationToken(tokenResponse.authToken, 'westeurope');
			speechConfig.current.speechRecognitionLanguage = 'en-US';

			const tt = new MediaStream([track]);
			const audioContext = new AudioContext();
			const audioIn = audioContext.createMediaStreamSource(tt);
			const dest = audioContext.createMediaStreamDestination();
			audioIn.connect(dest);
			streamRef.current = { ctx: audioContext, dst: dest, src: audioIn };
			const audioConfig = AudioConfig.fromStreamInput(streamRef.current.dst.stream);
			speechRecognizer.current = new SpeechRecognizer(speechConfig.current, audioConfig);
			speechRecognizer.current.recognizing = (sender, event) => {
				const currentWords = event.result.text.split(' ');
				if (!isMobile) {
					if (currentWords.length > maxWords) {
						currentWords.splice(0, maxWords);
						const newLiveTranscription = currentWords.join(' ');
						setTimeout(() => {
							setLiveTranscription(newLiveTranscription);
						}, 1500);
					} else if (currentWords.length < maxWords) {
						setTimeout(() => {
							setLiveTranscription(event.result.text);
						}, 1000);
					} else {
						setLiveTranscription(event.result.text);
					}
				} else {
					let words = [];
					event.result.text.split(' ').forEach(word => {
						if (words.length < maxWords) {
							words.push(word);
						} else {
							words.splice(0, maxWords);
							words.push(word);
						}
					});
					setTimeout(() => {
						setLiveTranscription(words.join(' '));
					}, 1500);
				}
			};

			speechRecognizer.current.speechEndDetected = (sender, event) => {
				liveTranscriptTimeoutRef.current = setTimeout(() => {
					setLiveTranscription('');
				}, 3000);
			};

			speechRecognizer.current.canceled = (sender, event) => {
				setLiveTranscription('');
				setShowClosedCaptions(false);
				setClosedCaptionsError([{ id: v4(), message: translate('failedToGetCaptions') }]);
			};

			speechRecognizer.current.startContinuousRecognitionAsync();
		};

		fetchToken();

		return () => {
			setLiveTranscription('');
			clearTimeout(liveTranscriptTimeoutRef.current);
			speechRecognizer.current?.stopContinuousRecognitionAsync();
		};
	}, []);

	useEffect(() => {
		if (liveTranscription) {
			liveTranscriptTimeoutRef.current = setTimeout(() => {
				setLiveTranscription('');
			}, 3000);
		}

		return () => {
			clearTimeout(liveTranscriptTimeoutRef.current);
		};
	}, [liveTranscription]);

	useEffect(() => {
		if (speechRecognizer.current) {
			streamRef.current.src?.disconnect(streamRef.current.dst);
			if (track) {
				const tt = new MediaStream([track]);
				const audioIn = streamRef.current.ctx.createMediaStreamSource(tt);
				audioIn.connect(streamRef.current.dst);
				streamRef.current.src = audioIn;
			} else {
				streamRef.current.src = null;
			}
		}
	}, [track]);

	return (
		<StyledSpeechToText isHidden={!liveTranscription} isMobile={isMobile}>
			{liveTranscription}
		</StyledSpeechToText>
	);
};

/**
 * Normalizes the speech-token API response into the shape used to configure
 * the recognizer.
 *
 * @param {{data?: string} | null | undefined} tokenResponse Response whose `data` field carries the token.
 * @returns {Promise<{authToken: string, region: string} | {authToken: null, error: string}>}
 *   The token with its region, or `authToken: null` plus an error message when
 *   the response is missing or carries no token.
 */
const getTokenOrRefresh = async tokenResponse => {
	const token = tokenResponse?.data;
	if (!token) {
		// Return the documented error shape instead of throwing on a missing response.
		return { authToken: null, error: 'Speech token response does not contain a token' };
	}
	return { authToken: token, region: 'westeurope' };
};

export default SpeechToTextOverlay;
