Commit
* feat: implement core structure
* chore(bun.lockb): update lockfile
* feat: download and split a video file
* feat: transcribe an interview using whisper
* fix: add logging
* fix(src/transcribe.ts): skip splitting if unnecessary
* feat: proofread the transcribed text
* style(src/ai.ts): run biome
* feat: upload result files to Google Drive
* feat: reply file URLs to the command
* fix(src/main.ts): remove activity since the bot is never online
* style: run biome
* fix(src/ai.ts): define the max file size for Whisper API as const
* fix(src/ai.ts): fix error message
* fix(src/commands.ts): fix type assumption
* feat(src/transcribe.ts): remove temp directory after completion
* fix(src/transcribe.ts): log file uploads
* fix(src/commands.ts): improve bot response
* style(src/commands.ts): run biome
* style(src/gdrive.ts): use {} instead of undefined for spread
* fix(src/gdrive.ts): upload text instead of stream if the file is converted
* fix(src/transcribe.ts): remove duplicated logs
* fix(src/gdrive.ts): upload files with filename
* fix(src/gdrive.ts): specify mimeType if converting
* fix(src/gdrive.ts): remove extension if converting
* fix(src/commands.ts): inline fields of command response
* fix: use UUID as the filename of temp files to avoid a Whisper error
* fix(src/transcribe.ts): use unique-string instead of UUID for filename
* style: run biome
* fix(src/gdrive.ts): fix filename to upload
Showing 13 changed files with 947 additions and 1 deletion.
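The commit messages describe an end-to-end flow: a Discord command receives a recording, the file is downloaded and split, transcribed with Whisper, proofread by an AI model, uploaded to Google Drive, and the resulting file URLs are sent back as the command's reply. Only one source file appears in the diff excerpt below, so the following is a minimal sketch of that flow rather than code from this commit; it assumes discord.js, an option named "file", and a hypothetical runTranscriptionPipeline helper standing in for src/transcribe.ts and src/gdrive.ts.

// Hypothetical sketch of the command flow the commit messages describe.
// Assumes discord.js; every name below except the discord.js API is invented.
import type { ChatInputCommandInteraction } from "discord.js";

// Stand-in for the pipeline in src/transcribe.ts / src/gdrive.ts (not shown here):
// download, split if needed, transcribe, proofread, upload to Google Drive.
declare function runTranscriptionPipeline(
  fileUrl: string,
): Promise<{ transcriptionUrl: string; proofreadUrl: string }>;

export const handleTranscribeCommand = async (
  interaction: ChatInputCommandInteraction,
) => {
  // Transcription takes far longer than Discord's 3-second reply window,
  // so acknowledge the command first and edit the reply later.
  await interaction.deferReply();

  // The recording attached to the slash command (assumed option name).
  const attachment = interaction.options.getAttachment("file", true);

  const { transcriptionUrl, proofreadUrl } = await runTranscriptionPipeline(
    attachment.url,
  );

  // Reply with the Google Drive URLs of the result files.
  await interaction.editReply(
    `Transcription: ${transcriptionUrl}\nProofread: ${proofreadUrl}`,
  );
};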
@@ -21,5 +21,9 @@ module.exports = {
     "knip",
     "commitlint",
     "automerge",
+    "openai",
+    "consola",
+    "gdrive",
+    "ffprobe",
   ],
 };
Empty file.
@@ -0,0 +1,115 @@
import { createReadStream } from "node:fs";
import { GoogleGenerativeAI } from "@google/generative-ai";
import { env } from "bun";
import openAi from "openai";
import { SupportedLanguages } from "./transcribe";

/**
 * OpenAI API client.
 */
export const openaiClient = new openAi({
  apiKey: env.OPENAI_API_KEY,
});

/**
 * Maximum file size for Whisper API.
 * @see https://platform.openai.com/docs/api-reference/speech-to-text
 */
export const whisperMaxFileSize = 25 * 1000 * 1000;

/**
 * Gemini API client.
 */
export const geminiClient = new GoogleGenerativeAI(env.GEMINI_API_KEY);

/**
 * Transcribe an audio file.
 * @param audioFilePath Path to the audio file
 * @param language Language of the audio file
 * @returns Transcribed text segments
 */
export const transcribeAudioFile = async (
  audioFilePath: string,
  language: SupportedLanguages,
): Promise<string[]> => {
  const response = (await openaiClient.audio.transcriptions.create({
    file: createReadStream(audioFilePath),
    model: "whisper-1",
    language,
    prompt:
      language === "en"
        ? "Hello. This is an interview, and you transcribe it."
        : "こんにちは。これはインタビューの録音で、文字起こしをします。",
    // biome-ignore lint/style/useNamingConvention: library's naming convention
    response_format: "verbose_json",
  })) as openAi.Audio.Transcriptions.Transcription & {
    segments: {
      text: string;
    }[];
  }; // cast since the library doesn't support verbose_json

  return response.segments.map((segment) => segment.text);
};

/**
 * Proofread a transcription.
 * @param transcription Transcription to proofread
 * @param language Language of the transcription
 * @param model AI model to use
 * @returns The model used, the system prompt, and the proofread text
 */
export const proofreadTranscription = async <M extends "gpt-4" | "gemini-pro">(
  transcription: string,
  language: SupportedLanguages,
  model: M,
): Promise<{ model: M; prompt: string; response: string }> => {
  const systemPrompt = `You are a web media proofreader.
The text ${model === "gpt-4" ? "entered by the user" : "below"} is a transcription of the interview.
Follow the guide below and improve it.
- Remove redundant or repeating expressions.
- Remove fillers.
- Correct grammar errors.
- Replace unnatural or difficult wordings.
- Shorten sentences.
The output style should be the style of an interview, like \`interviewer: \` or \`interviewee\`.
${
  language === "en"
    ? "The response must not include markdown syntax."
    : "The response must be in Japanese without markdown syntax."
}`;

  let result = "";
  if (model === "gpt-4") {
    const response = await openaiClient.chat.completions.create({
      messages: [
        {
          role: "system",
          content: systemPrompt,
        },
        {
          role: "user",
          content: transcription,
        },
      ],
      model,
    });
    result = response.choices[0]?.message.content ?? "";
  } else {
    const response = await geminiClient
      .getGenerativeModel({
        model,
      })
      .generateContent(`${systemPrompt}\n\n---\n\n${transcription}`);
    result = response.response.text();
  }
  if (!result) {
    throw new Error("The response is empty.");
  }

  return {
    model,
    prompt: systemPrompt,
    response: result,
  };
};
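To show how the exports above fit together, here is a minimal sketch of a caller along the lines of src/transcribe.ts, which is not included in this excerpt. Only whisperMaxFileSize, transcribeAudioFile, and proofreadTranscription come from the file above; the splitAudio helper, the "./ai" import path, and the choice of "en" and "gpt-4" are assumptions.

// Editor's sketch of a caller for the exports above; splitAudio and the
// file layout are assumptions, not code from this commit.
import { statSync } from "node:fs";
import {
  proofreadTranscription,
  transcribeAudioFile,
  whisperMaxFileSize,
} from "./ai";

// Hypothetical splitter: returns paths of chunks that each stay under `maxBytes`.
// The actual splitting implementation in the commit is not shown here.
declare function splitAudio(path: string, maxBytes: number): Promise<string[]>;

export const transcribeAndProofread = async (audioFilePath: string) => {
  // Split only when the file exceeds the Whisper API limit, matching the
  // "skip splitting if unnecessary" commit.
  const chunks =
    statSync(audioFilePath).size > whisperMaxFileSize
      ? await splitAudio(audioFilePath, whisperMaxFileSize)
      : [audioFilePath];

  // Transcribe each chunk and concatenate the returned segments.
  const segments: string[] = [];
  for (const chunk of chunks) {
    segments.push(...(await transcribeAudioFile(chunk, "en")));
  }
  const transcription = segments.join("\n");

  // Proofread with one of the two supported models.
  const proofread = await proofreadTranscription(transcription, "en", "gpt-4");
  return { transcription, proofread: proofread.response };
};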