feat(audio-conversion): add controller for Google Speech-to-Text API

Branch: master
Author: adriano, 2023-11-03 09:02:59 -03:00
Parent: 0ae216e9bc
Commit: 5673c86505
16 changed files with 1552 additions and 70 deletions

app.js

@@ -10,21 +10,18 @@ const morgan = require('morgan')
// const fileUpload = require('express-fileupload')
const rateLimiter = require('express-rate-limit')
const helmet = require('helmet')
const xss = require('xss-clean')
const cors = require('cors')
// Swagger
const swaggerUI = require('swagger-ui-express')
const YAML = require('yamljs')
const swaggerDocument = YAML.load('./swagger.yaml')
// database
const connectDB = require('./db/connect')
const helmet = require('helmet')
const xss = require('xss-clean')
const cors = require('cors')
// routers
const nlRouter = require('./routes/naturalLanguageRoute')
const notFoundMiddlware = require('./middleware/not-found')
const errorHandlerMiddleware = require('./middleware/error-handler')
@@ -35,7 +32,7 @@ app.use(rateLimiter({
windowMs: 15 * 60 * 1000,
max: 60,
}))
// Security packages
app.use(helmet())
app.use(cors())
@@ -43,17 +40,15 @@ app.use(xss())
app.use(morgan('tiny'))
app.use(express.json())
// app.use(express.static('./public'))
// app.use(fileUpload())
app.get('/', (req, res) => {
res.send('<h1>Sentiment API</h1><a href="/api-docs">Documentation</a>')
})
app.use('/api-docs', swaggerUI.serve, swaggerUI.setup(swaggerDocument))
app.use('/api/v1/nl', nlRouter)
app.use(notFoundMiddlware)

controllers/naturalLanguageController.js

@@ -1,8 +1,16 @@
const { StatusCodes } = require("http-status-codes")
const { sentiment, convertTextToSpeech, listVoice } = require("../utils")
const { sentiment, convertTextToSpeech, listVoice, convertAudioToLinear16, getAudioDuration } = require("../utils")
const language = require('@google-cloud/language').v2
const CustomError = require('../errors')
const voiceConfigList = require('../mockData/voice.json')
const languageCodes = require('../mockData/languageCodes.json')
const path = require('path')
const fs = require('fs')
const speech = require('@google-cloud/speech')
const { speechToText, speechToTextJob } = require('../utils')
const client = new speech.SpeechClient()
const protobuf = require('protobufjs')
const getSentiment = async (req, res) => {
@@ -16,7 +24,7 @@ const getSentiment = async (req, res) => {
const getAudioFromText = async (req, res) => {
const { text, voice_name, voice_gender, languageCode } = req.query
if ((voice_name || voice_gender || languageCode) && languageCode == 'pt-BR') {
const config = { voice_name, voice_gender, languageCode }
@@ -39,18 +47,16 @@ const getAudioFromText = async (req, res) => {
const audioBuffer = await convertTextToSpeech(text, voice_name, voice_gender, languageCode)
if (voice_name && voice_gender && languageCode){
if (voice_name && voice_gender && languageCode) {
filename = `${voice_name}_${voice_gender}_${languageCode}.mp3`
}
else{
else {
filename = `pt-BR-Standard-B_MALE_pt-BR.mp3`
}
// Set the Content-Disposition header
// res.set("Content-Disposition", `attachment; filename="${filename}"`);
res.set("Content-Disposition", `inline; filename="${filename}"`);
res.set("Content-Disposition", `inline; filename="${filename}"`)
res.contentType('audio/mpeg')
@@ -58,6 +64,134 @@ const getAudioFromText = async (req, res) => {
}
const getTextFromAudio = async (req, res) => {
const { languageCode } = req.body
const audio = req.file
if (!audio)
throw new CustomError.BadRequestError(`Missing the audio file`)
if (languageCode) {
const existLanguageCode = languageCodes.find(l => l.languageCode == languageCode)
if (!existLanguageCode) {
fs.unlinkSync(audio.path)
throw new CustomError.BadRequestError(`Invalid language code`)
}
}
const inputFile = path.resolve(audio.path)
const fileName = path.basename(inputFile, path.extname(inputFile))
const outputFile = path.join(__dirname, '..', 'public', 'uploads', `${fileName}.wav`)
const filePath = await convertAudioToLinear16(inputFile, outputFile)
fs.unlinkSync(inputFile)
const obj = await speechToText(filePath, languageCode)
fs.unlinkSync(filePath)
if (obj?.transcription) return res.status(StatusCodes.OK).json({ transcription: obj.transcription })
res.status(obj.status).json({ msg: obj.msg })
}
const uploadAudioToTranscript = async (req, res) => {
const { languageCode } = req.body
const audio = req.file
if (!audio)
throw new CustomError.BadRequestError(`Missing the audio file`)
if (languageCode) {
const existLanguageCode = languageCodes.find(l => l.languageCode == languageCode)
if (!existLanguageCode) {
fs.unlinkSync(audio.path)
throw new CustomError.BadRequestError(`Invalid language code`)
}
}
const inputFile = path.resolve(audio.path)
const fileName = path.basename(inputFile, path.extname(inputFile))
const outputFile = path.join(__dirname, '..', 'public', 'uploads', `${fileName}.wav`)
const filePath = await convertAudioToLinear16(inputFile, outputFile)
fs.unlinkSync(inputFile)
const obj = await speechToTextJob(filePath, languageCode)
fs.unlinkSync(filePath)
if (obj?.operationName) return res.status(StatusCodes.OK).json({ operationId: obj.operationName })
res.status(obj.status).json({ msg: obj.msg })
}
const getJobStatus = async (req, res) => {
const { operationName } = req.query
if (!operationName)
throw new CustomError.BadRequestError(`Missing operationName query parameter`)
// Get the operation using the operationName
const [response] = await client.getOperation({ name: operationName })
if (!response) {
return res.status(404).json({ msg: "Operation not found" })
}
if (response.done) {
// Load the protobuf message types
const root = new protobuf.Root()
root.loadSync(path.join(__dirname, '..', 'node_modules', 'google-proto-files', 'google', 'rpc', 'status.proto'), { keepCase: true })
root.loadSync(path.join(__dirname, '..', 'node_modules', 'google-proto-files', 'google', 'protobuf', 'duration.proto'), { keepCase: true })
root.loadSync(path.join(__dirname, '..', 'node_modules', 'google-proto-files', 'google', 'cloud', 'speech', 'v1', 'cloud_speech.proto'), { keepCase: true })
// Get the message type
const LongRunningRecognizeResponse = root.lookupType('google.cloud.speech.v1.LongRunningRecognizeResponse')
if (!response) {
return res.status(StatusCodes.NOT_FOUND).json({ msg: "Operation not found" })
}
// Decode the response value to get transcribed text
const longRunningResponse = LongRunningRecognizeResponse.decode(response.response.value)
if (longRunningResponse.error) {
console.error('Error:', longRunningResponse.error)
res.status(StatusCodes.INTERNAL_SERVER_ERROR).json({ msg: longRunningResponse.error })
} else {
const transcriptions = longRunningResponse.results.map(result => result.alternatives[0].transcript)
const fullTranscription = transcriptions.join(' ')
// console.log('Full Transcription:', fullTranscription)
res.status(StatusCodes.OK).json({ transcription: fullTranscription })
}
} else {
res.status(StatusCodes.ACCEPTED).json({ msg: "Transcription in progress" })
}
}
const getVoiceConfig = async (req, res) => {
const { languageCode } = req.query
@@ -72,5 +206,8 @@ const getVoiceConfig = async (req, res) => {
module.exports = {
getSentiment,
getAudioFromText,
getVoiceConfig
getTextFromAudio,
getVoiceConfig,
getJobStatus,
uploadAudioToTranscript
}
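For context, a minimal sketch of the asynchronous flow handled by uploadAudioToTranscript and getJobStatus above (not part of the commit; it assumes Node 18+ globals, a hypothetical local sample.wav, and the localhost:6001 base URL listed in swagger.yaml):

// Hypothetical client: start a transcription job, then poll until it finishes.
const fs = require('fs')
const BASE = 'http://localhost:6001/api/v1/nl'
async function transcribeAsJob() {
  const form = new FormData()
  // 'audio' is the field name expected by audioUpload.single('audio')
  form.append('audio', new Blob([fs.readFileSync('./sample.wav')]), 'sample.wav')
  form.append('languageCode', 'pt-BR')
  const start = await fetch(`${BASE}/upload-audio-to-transcript`, { method: 'POST', body: form })
  const { operationId } = await start.json()
  // getJobStatus replies 202 while the job is running and 200 with the transcription once done
  let poll
  do {
    await new Promise(r => setTimeout(r, 5000))
    poll = await fetch(`${BASE}/query-job-status?operationName=${encodeURIComponent(operationId)}`)
  } while (poll.status === 202)
  console.log(await poll.json())
}
transcribeAsJob().catch(console.error)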

mockData/languageCodes.json

@@ -0,0 +1,59 @@
[
{ "languageCode": "af-ZA" },
{ "languageCode": "ar-XA" },
{ "languageCode": "bg-BG" },
{ "languageCode": "bn-IN" },
{ "languageCode": "ca-ES" },
{ "languageCode": "cmn-CN" },
{ "languageCode": "cmn-TW" },
{ "languageCode": "cs-CZ" },
{ "languageCode": "da-DK" },
{ "languageCode": "de-DE" },
{ "languageCode": "el-GR" },
{ "languageCode": "en-AU" },
{ "languageCode": "en-GB" },
{ "languageCode": "en-IN" },
{ "languageCode": "en-US" },
{ "languageCode": "es-ES" },
{ "languageCode": "es-US" },
{ "languageCode": "eu-ES" },
{ "languageCode": "fi-FI" },
{ "languageCode": "fil-PH" },
{ "languageCode": "fr-CA" },
{ "languageCode": "fr-FR" },
{ "languageCode": "gl-ES" },
{ "languageCode": "gu-IN" },
{ "languageCode": "he-IL" },
{ "languageCode": "hi-IN" },
{ "languageCode": "hu-HU" },
{ "languageCode": "id-ID" },
{ "languageCode": "is-IS" },
{ "languageCode": "it-IT" },
{ "languageCode": "ja-JP" },
{ "languageCode": "kn-IN" },
{ "languageCode": "ko-KR" },
{ "languageCode": "lt-LT" },
{ "languageCode": "lv-LV" },
{ "languageCode": "ml-IN" },
{ "languageCode": "mr-IN" },
{ "languageCode": "ms-MY" },
{ "languageCode": "nb-NO" },
{ "languageCode": "nl-BE" },
{ "languageCode": "nl-NL" },
{ "languageCode": "pa-IN" },
{ "languageCode": "pl-PL" },
{ "languageCode": "pt-BR" },
{ "languageCode": "pt-PT" },
{ "languageCode": "ro-RO" },
{ "languageCode": "ru-RU" },
{ "languageCode": "sk-SK" },
{ "languageCode": "sr-RS" },
{ "languageCode": "sv-SE" },
{ "languageCode": "ta-IN" },
{ "languageCode": "te-IN" },
{ "languageCode": "th-TH" },
{ "languageCode": "tr-TR" },
{ "languageCode": "uk-UA" },
{ "languageCode": "vi-VN" },
{ "languageCode": "yue-HK" }
]

package-lock.json generated

File diff suppressed because it is too large.

package.json

@@ -11,6 +11,8 @@
"license": "ISC",
"dependencies": {
"@google-cloud/language": "^6.1.0",
"@google-cloud/speech": "^6.0.2",
"@google-cloud/storage": "^7.4.0",
"@google-cloud/text-to-speech": "^5.0.1",
"bcryptjs": "^2.4.3",
"cookie-parser": "^1.4.5",
@@ -21,14 +23,20 @@
"express-fileupload": "^1.2.1",
"express-mongo-sanitize": "^2.1.0",
"express-rate-limit": "^5.4.1",
"fluent-ffmpeg": "^2.1.2",
"google-gax": "^4.0.5",
"google-proto-files": "^4.0.0",
"google-protobuf": "^3.21.2",
"helmet": "^4.6.0",
"http-status-codes": "^2.1.4",
"joi": "^17.4.0",
"mongoose": "^7.3.1",
"morgan": "^1.10.0",
"multer": "^1.4.5-lts.1",
"protobufjs": "^7.2.5",
"swagger-ui-express": "^4.1.6",
"validator": "^13.6.0",
"xss-clean": "^0.1.1",
"swagger-ui-express": "^4.1.6",
"yamljs": "^0.3.0"
},
"devDependencies": {

Binary image file not shown (127 KiB before this commit).

Binary image file not shown (205 KiB before this commit).

routes/naturalLanguageRoute.js

@@ -1,11 +1,14 @@
const express = require('express')
const router = express.Router()
const { authorization, } = require('../middleware/authentication')
const { audioUpload } = require("../utils")
const { getSentiment, getAudioFromText, getVoiceConfig } = require('../controllers/naturalLanguageController')
const { getSentiment, getAudioFromText, getTextFromAudio, getVoiceConfig, uploadAudioToTranscript, getJobStatus } = require('../controllers/naturalLanguageController')
router.route('/sentiment').post(authorization, getSentiment)
router.route('/text-to-speech').get(getAudioFromText)
router.route('/speech-to-text').post(audioUpload.single('audio'), getTextFromAudio)
router.route('/upload-audio-to-transcript').post(audioUpload.single('audio'), uploadAudioToTranscript)
router.route('/query-job-status').get(getJobStatus)
router.route('/voice-config').get(getVoiceConfig)
module.exports = router
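A matching sketch for the synchronous route (again an assumption, not part of the commit; Node 18+ globals, a hypothetical sample.mp3, and the base URL from swagger.yaml):

// Hypothetical client: upload an audio file and wait for the transcription in the response.
const fs = require('fs')
async function transcribeSample() {
  const form = new FormData()
  form.append('audio', new Blob([fs.readFileSync('./sample.mp3')]), 'sample.mp3')
  form.append('languageCode', 'pt-BR')
  const res = await fetch('http://localhost:6001/api/v1/nl/speech-to-text', { method: 'POST', body: form })
  console.log(res.status, await res.json())
}
transcribeSample().catch(console.error)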

swagger.yaml

@@ -2,6 +2,7 @@ openapi: 3.0.0
info:
title: Natural Language API
contact: {}
description: This API provides endpoints and parameters for using Google Cloud API resources.
version: '1.0'
servers:
- url: http://localhost:6001/api/v1/nl/

utils/audioUpload.js

@@ -0,0 +1,25 @@
const multer = require('multer')
const path = require('path')
//Destination to store the file
const audioStorage = multer.diskStorage({
destination: function (req, file, cb) {
cb(null, `public/uploads`)
},
filename: function (req, file, cb) {
cb(null, Date.now() + String(Math.floor(Math.random() * 1000)) + path.extname(file.originalname))
}
})
const audioUpload = multer({
storage: audioStorage,
fileFilter(req, file, cb) {
if (!file.originalname.match(/\.(mp3|wav|ogg|flac|aac|wma|m4a|mp4|webm|opus|mpeg)$/i)) {
return cb(new Error('Invalid file type. Send only an audio file!'))
}
cb(undefined, true)
}
})
module.exports = audioUpload
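One operational note: multer's diskStorage does not create the destination folder, so public/uploads has to exist before the first upload reaches this middleware. A small startup guard along these lines (an assumption, not part of the commit) avoids that failure:

// Hypothetical guard, run once at startup (e.g. near the top of app.js):
// create the upload destination used by audioStorage if it is missing.
const fs = require('fs')
fs.mkdirSync('public/uploads', { recursive: true })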

utils/audioUploadToBucket.js

@@ -0,0 +1,35 @@
// Imports the Google Cloud client library
const { Storage } = require('@google-cloud/storage')
async function audioUploadToBucket(
bucketName,
filePath,
destFileName,
) {
// [START storage_upload_file]
// Creates a client
const storage = new Storage()
async function uploadFile() {
const options = {
destination: destFileName,
}
await storage.bucket(bucketName).upload(filePath, options)
console.log(`${filePath} uploaded to ${bucketName}`)
}
try {
await uploadFile()
return true
} catch (error) {
console.error(error)
return false
}
// [END storage_upload_file]
}
module.exports = audioUploadToBucket
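This Storage client, like the SpeechClient instances elsewhere in the commit, authenticates through Application Default Credentials; a sketch of the two usual setups, with placeholder paths that are not part of the commit:

// Option 1: point ADC at a service-account key before starting the server
//   export GOOGLE_APPLICATION_CREDENTIALS=/path/to/service-account.json
// Option 2 (hypothetical): pass the key file explicitly when building the client
const { Storage } = require('@google-cloud/storage')
const storage = new Storage({ keyFilename: '/path/to/service-account.json' })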

utils/convertAudioToLinear16.js

@@ -0,0 +1,16 @@
const ffmpeg = require('fluent-ffmpeg')
async function convertToLINEAR16(inputFile, outputFile) {
return new Promise((resolve, reject) => {
ffmpeg(inputFile)
.audioCodec('pcm_s16le') // Set the audio codec to LINEAR16
.audioFrequency(16000) // Set the sample rate to 16,000 Hz
.audioChannels(1)
.on('end', () => resolve(outputFile))
.on('error', (err) => reject(err))
.save(outputFile)
})
}
module.exports = convertToLINEAR16

utils/getAudioDuration.js

@@ -0,0 +1,17 @@
const ffmpeg = require('fluent-ffmpeg')
async function getAudioDuration(filePath) {
return new Promise((resolve, reject) => {
ffmpeg.ffprobe(filePath, (err, metadata) => {
if (err) {
reject(err)
} else {
resolve(Math.round(metadata.format.duration))
}
})
})
}
module.exports = getAudioDuration
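Both helpers delegate to the system ffmpeg/ffprobe binaries, which fluent-ffmpeg expects to find on PATH (or via setFfmpegPath). A small usage sketch chaining them, with hypothetical file names that are not from the commit:

// Hypothetical example: convert an upload to 16 kHz mono LINEAR16 WAV, then read its
// duration, mirroring what the controller does for each request.
const convertToLINEAR16 = require('./convertAudioToLinear16')
const getAudioDuration = require('./getAudioDuration')
async function prepare() {
  const wav = await convertToLINEAR16('./sample.mp3', './sample.wav')
  const seconds = await getAudioDuration(wav)
  console.log(`${wav}: ${seconds}s`)
}
prepare().catch(console.error)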

utils/index.js

@@ -2,13 +2,24 @@
// const createTokenUser = require('./createTokenUser')
// const checkPermissions = require('./checkPermissions')
const sentiment = require('./sentiment')
const convertTextToSpeech = require('./textToSpeech')
const listVoice = require('./listVoice')
const convertAudioToLinear16 = require('./convertAudioToLinear16')
const getAudioDuration = require('./getAudioDuration')
const audioUploadToBucket = require('./audioUploadToBucket')
const audioUpload = require('./audioUpload')
const speechToText = require('./speechToText')
const speechToTextJob = require('./speechToTextJob')
module.exports = {
sentiment,
convertTextToSpeech,
listVoice
listVoice,
convertAudioToLinear16,
getAudioDuration,
audioUploadToBucket,
audioUpload,
speechToText,
speechToTextJob
}

utils/speechToText.js

@@ -0,0 +1,67 @@
// Imports the Google Cloud client library
const speech = require('@google-cloud/speech')
const { StatusCodes } = require("http-status-codes")
const path = require('path')
const fs = require('fs')
const getAudioDuration = require('./getAudioDuration')
const audioUploadToBucket = require('./audioUploadToBucket')
async function speechToText(filename, languageCode = 'pt-BR', bucket = 'speect-to-text-bucket', sampleRateHertz = 16000, encoding = 'LINEAR16') {
const client = new speech.SpeechClient()
let audio
const config = {
encoding: encoding,
sampleRateHertz: sampleRateHertz,
languageCode: languageCode,
}
const seconds = await getAudioDuration(filename)
if (seconds >= 28800) {
return { msg: 'Audio file is longer than 480 minutes', status: StatusCodes.BAD_REQUEST }
}
else if (seconds <= 59) {
audio = {
content: fs.readFileSync(filename).toString('base64'),
}
} else if (seconds >= 60) {
const uploaded = await audioUploadToBucket(bucket, filename, path.basename(filename))
if (uploaded) {
audio = {
uri: `gs://${bucket}/${path.basename(filename)}`,
}
}
}
if (!audio) return { msg: `Error uploading the file to the Google Cloud Storage bucket (${bucket})`, status: StatusCodes.INTERNAL_SERVER_ERROR }
const request = {
config: config,
audio: audio,
}
try {
// Detects speech in the audio file. This creates a recognition job that you
// can wait for now, or get its result later.
const [operation] = await client.longRunningRecognize(request)
// Get a Promise representation of the final result of the job
const [response] = await operation.promise()
const transcription = response.results
.map(result => result.alternatives[0].transcript)
.join('\n')
console.log(`Transcription: ${transcription}`)
return { msg: `Transcript success`, status: StatusCodes.OK, transcription }
} catch (error) {
console.log('ERROR TRANSCRIBING FILE: ', error)
return { msg: `Error transcribing the file`, status: StatusCodes.INTERNAL_SERVER_ERROR }
}
}
module.exports = speechToText
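For audio over a minute the helper stages a copy in the bucket and never removes it; a cleanup along these lines (an assumption, not in the commit) could run after a successful transcription:

// Hypothetical cleanup: delete the staged object once the transcript has been produced.
const { Storage } = require('@google-cloud/storage')
const path = require('path')
async function removeStagedAudio(bucketName, filename) {
  const storage = new Storage()
  await storage.bucket(bucketName).file(path.basename(filename)).delete()
}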

utils/speechToTextJob.js

@@ -0,0 +1,64 @@
// Imports the Google Cloud client library
const speech = require('@google-cloud/speech')
const path = require('path')
const fs = require('fs')
const getAudioDuration = require('./getAudioDuration')
const audioUploadToBucket = require('./audioUploadToBucket')
const { StatusCodes } = require("http-status-codes")
async function speechToTextJob(filename, languageCode = 'pt-BR', bucket = 'speect-to-text-bucket', sampleRateHertz = 16000, encoding = 'LINEAR16') {
const client = new speech.SpeechClient()
let audio
const config = {
encoding: encoding,
sampleRateHertz: sampleRateHertz,
languageCode: languageCode,
}
const seconds = await getAudioDuration(filename)
if (seconds >= 28800) {
return { msg: 'Audio file is longer than 480 minutes', status: StatusCodes.BAD_REQUEST }
}
else if (seconds <= 59) {
audio = {
content: fs.readFileSync(filename).toString('base64'),
}
} else if (seconds >= 60) {
const uploaded = await audioUploadToBucket(bucket, filename, path.basename(filename))
if (uploaded) {
audio = {
uri: `gs://${bucket}/${path.basename(filename)}`,
}
}
}
if (!audio) return { msg: `Error uploading the file to the Google Cloud Storage bucket (${bucket})`, status: StatusCodes.INTERNAL_SERVER_ERROR }
const request = {
config: config,
audio: audio,
}
try {
// Detects speech in the audio file. This creates a recognition job that you
// can wait for now, or get its result later.
const [operation] = await client.longRunningRecognize(request)
console.log('===========> operationName: ', operation.name)
return { msg: `success`, status: StatusCodes.OK, operationName: operation.name }
} catch (error) {
console.log('ERROR TRANSCRIBING FILE: ', error)
return { msg: `Error transcribing the file`, status: StatusCodes.INTERNAL_SERVER_ERROR }
}
}
module.exports = speechToTextJob
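The operation name returned here is what getJobStatus later decodes by hand with protobufjs; depending on the installed @google-cloud/speech version, the generated client can also rehydrate the operation from its name, roughly as below (a sketch of an alternative, not the commit's approach):

// Hypothetical alternative: let the client reconstruct the long-running operation
// and read its result instead of decoding the protobuf manually.
const speech = require('@google-cloud/speech')
async function fetchTranscription(operationName) {
  const client = new speech.SpeechClient()
  const op = await client.checkLongRunningRecognizeProgress(operationName)
  if (!op.done) return null // still in progress
  return op.result.results.map(r => r.alternatives[0].transcript).join(' ')
}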