import type {
  AudioFormat,
  AudioFrame,
  VoiceProvider,
  VoiceProviderConnectInput,
  VoiceProviderSession,
} from "./types.ts";
import {
  audioFrameToBase64,
  decodeBase64,
  RealtimeWebSocketSession,
  type ParsedRealtimeMessage,
  type WebSocketFactory,
} from "./realtime-websocket.ts";

/** Construction options for {@link OpenAiRealtimeProvider}. */
export interface OpenAiRealtimeProviderOptions {
  /** API key sent as `Authorization: Bearer <apiKey>` when connecting. */
  apiKey: string;
  /**
   * Model identifier. Set as the `model` query parameter on the connection
   * URL and echoed in the non-legacy `session.update` payload.
   */
  model: string;
  /** WebSocket endpoint; defaults to `wss://api.openai.com/v1/realtime`. */
  realtimeUrl?: string;
  /**
   * Voice used when the connect input does not specify one. Falls back to
   * `"alloy"` when this is also unset.
   */
  defaultVoice?: string;
  /**
   * Shape of the `session.update` payload. `"legacy"` emits the flat form
   * (`input_audio_format`, `output_audio_format`, `voice`); any other value,
   * including omitted, emits the nested `session.audio.{input,output}` form.
   */
  sessionShape?: "ga" | "legacy";
  /** Optional factory passed through unchanged to `RealtimeWebSocketSession`. */
  webSocketFactory?: WebSocketFactory;
}

/**
 * Voice provider that builds a `RealtimeWebSocketSession` configured for the
 * OpenAI Realtime API: connection URL and headers, the `session.update`
 * payload, and the client-event builders for audio, text, tool results and
 * interruption.
 */
export class OpenAiRealtimeProvider implements VoiceProvider {
  readonly kind = "openai_realtime" as const;

  constructor(private readonly opts: OpenAiRealtimeProviderOptions) {}

  /**
   * Creates a realtime session for one call.
   *
   * @param input Per-call settings: call id, event sink, instructions, tools,
   *   audio formats and optional voice / safety identifier.
   * @returns The session object wrapping the WebSocket connection.
   */
  async connect(input: VoiceProviderConnectInput): Promise<VoiceProviderSession> {
    const url = new URL(this.opts.realtimeUrl ?? "wss://api.openai.com/v1/realtime");
    url.searchParams.set("model", this.opts.model);

    const headers: Record<string, string> = {
      Authorization: `Bearer ${this.opts.apiKey}`,
    };
    // Only attach the safety identifier header when the caller supplied one.
    if (input.safetyIdentifier) {
      headers["OpenAI-Safety-Identifier"] = input.safetyIdentifier;
    }

    return new RealtimeWebSocketSession({
      provider: this.kind,
      sessionId: input.callId,
      url: url.toString(),
      headers,
      events: input.events,
      webSocketFactory: this.opts.webSocketFactory,
      makeSessionUpdate: () => this.makeSessionUpdate(input),
      makeAppendAudio: (frame) => ({
        type: "input_audio_buffer.append",
        audio: audioFrameToBase64(frame),
      }),
      // A user text turn is followed by response.create so the model answers it.
      makeTextMessage: (text) => [
        {
          type: "conversation.item.create",
          item: {
            type: "message",
            role: "user",
            content: [{ type: "input_text", text }],
          },
        },
        { type: "response.create" },
      ],
      makeToolResult: (toolCallId, result) => [
        {
          type: "conversation.item.create",
          item: {
            type: "function_call_output",
            call_id: toolCallId,
            // JSON.stringify returns undefined for undefined, functions and
            // symbols; the `output` key would then vanish when the event is
            // serialized. Fall back to the JSON literal "null" so the field is
            // always a string.
            output: JSON.stringify(result) ?? "null",
          },
        },
        { type: "response.create" },
      ],
      makeInterrupt: () => ({ type: "response.cancel" }),
      parseMessage: (raw) => parseOpenAiRealtimeMessage(raw, input.outputFormat),
    });
  }

  /**
   * Builds the `session.update` client event for this call.
   *
   * With `sessionShape === "legacy"` the flat payload is produced and both
   * audio formats are fixed to `"pcm16"`; otherwise the nested payload is
   * produced and carries the sample rates from `input.inputFormat` and
   * `input.outputFormat`.
   */
  private makeSessionUpdate(input: VoiceProviderConnectInput): Record<string, unknown> {
    const tools = input.tools.map((tool) => ({
      type: "function",
      name: tool.name,
      description: tool.description,
      parameters: tool.parameters,
    }));
    // Per-call voice wins, then the provider default, then "alloy".
    const voice = input.voice ?? this.opts.defaultVoice ?? "alloy";

    if (this.opts.sessionShape === "legacy") {
      return {
        type: "session.update",
        session: {
          instructions: input.instructions,
          voice,
          input_audio_format: "pcm16",
          output_audio_format: "pcm16",
          turn_detection: { type: "server_vad" },
          tools,
          tool_choice: "auto",
        },
      };
    }

    return {
      type: "session.update",
      session: {
        type: "realtime",
        model: this.opts.model,
        instructions: input.instructions,
        output_modalities: ["audio"],
        audio: {
          input: {
            format: {
              type: "audio/pcm",
              rate: input.inputFormat.sampleRateHz,
            },
            turn_detection: { type: "server_vad" },
          },
          output: {
            format: {
              type: "audio/pcm",
              rate: input.outputFormat.sampleRateHz,
            },
            voice,
          },
        },
        tools,
        tool_choice: "auto",
      },
    };
  }
}

/**
 * Translates one decoded OpenAI Realtime server event into zero or more
 * provider-neutral messages.
 *
 * Recognized events (both naming variants where two are listed):
 * - `response.audio.delta` / `response.output_audio.delta` → `audio`
 * - `response.audio_transcript.{delta,done}` /
 *   `response.output_audio_transcript.{delta,done}` → agent `transcript`
 *   (`isFinal` is true for the `.done` variants)
 * - `conversation.item.input_audio_transcription.completed` → final caller
 *   `transcript`
 * - `response.function_call_arguments.done` → `tool_call`
 * - `error` → `error`
 *
 * Anything else, or a recognized event missing its required string fields,
 * yields an empty array.
 *
 * @param raw Decoded event; non-object values are ignored.
 * @param outputFormat Format attached to decoded audio frames. Pass the
 *   session's negotiated output format when parsing provider audio; the
 *   default exists for legacy helper callers only.
 * @returns The parsed messages, possibly empty.
 */
export function parseOpenAiRealtimeMessage(
  raw: unknown,
  outputFormat: AudioFormat = { codec: "pcm16", sampleRateHz: 16000, channels: 1 },
): ParsedRealtimeMessage[] {
  if (!raw || typeof raw !== "object") return [];
  const event = raw as Record<string, unknown>;
  const type = String(event.type ?? "");

  switch (type) {
    case "response.audio.delta":
    case "response.output_audio.delta":
      if (typeof event.delta !== "string") return [];
      return [
        {
          type: "audio",
          frame: pcm16FrameFromBase64(event.delta, outputFormat),
        },
      ];

    case "response.audio_transcript.delta":
    case "response.audio_transcript.done":
    case "response.output_audio_transcript.delta":
    case "response.output_audio_transcript.done": {
      // Use whichever field is actually a string. Coercing `delta ?? transcript`
      // would stringify a non-string delta (e.g. "[object Object]") even when a
      // valid transcript is present.
      const text = typeof event.delta === "string" ? event.delta : event.transcript;
      if (typeof text !== "string") return [];
      return [
        {
          type: "transcript",
          role: "agent",
          text,
          isFinal: type.endsWith(".done"),
        },
      ];
    }

    case "conversation.item.input_audio_transcription.completed":
      if (typeof event.transcript !== "string") return [];
      return [
        {
          type: "transcript",
          role: "caller",
          text: event.transcript,
          isFinal: true,
        },
      ];

    case "response.function_call_arguments.done":
      if (typeof event.name !== "string" || typeof event.call_id !== "string") return [];
      return [
        {
          type: "tool_call",
          call: {
            id: event.call_id,
            name: event.name,
            // Malformed or non-object argument payloads become {}.
            arguments: parseJsonObject(event.arguments),
          },
        },
      ];

    case "error": {
      // The whole error object is serialized into the message when present.
      const message =
        typeof event.error === "object" && event.error
          ? JSON.stringify(event.error)
          : "OpenAI Realtime error";
      return [{ type: "error", error: new Error(message) }];
    }

    default:
      return [];
  }
}

/**
 * Parses a JSON string and returns it only when the result is a plain
 * (non-array, non-null) object. Non-string input, invalid JSON, and any other
 * JSON value all produce an empty object.
 */
function parseJsonObject(value: unknown): Record<string, unknown> {
  if (typeof value !== "string") {
    return {};
  }

  let decoded: unknown;
  try {
    decoded = JSON.parse(value);
  } catch {
    // Invalid JSON is treated the same as "no arguments".
    return {};
  }

  const isPlainObject =
    typeof decoded === "object" && decoded !== null && !Array.isArray(decoded);
  if (!isPlainObject) {
    return {};
  }
  return decoded as Record<string, unknown>;
}

/**
 * Builds an audio frame from a base64 payload: the bytes come from
 * `decodeBase64`, and the frame is tagged with the supplied output format.
 */
function pcm16FrameFromBase64(value: string, outputFormat: AudioFormat): AudioFrame {
  const data = decodeBase64(value);
  const frame: AudioFrame = { data, format: outputFormat };
  return frame;
}
