feat: transcribe audio in background script

Chrome 73 no longer allows bypassing CORS from content scripts, so speech service APIs are now called from the background script. The origin of background requests is removed for privacy. Closes #81.
5 years ago · 2c899261d6
parent 43f9ce53bc
commit 2c899261d6
3 changed files with 327 additions and 278 deletions
--- a/src/background/main.js
+++ b/src/background/main.js
@ -1,4 +1,5 @@
 import browser from 'webextension-polyfill';
+import audioBufferToWav from 'audiobuffer-to-wav';

 import {initStorage} from 'storage/init';
 import storage from 'storage/storage';
@ -13,9 +14,18 @@ import {
  scriptsAllowed,
  functionInContext,
  getBrowser,
-  getPlatform
+  getPlatform,
+  arrayBufferToBase64
 } from 'utils/common';
-import {clientAppVersion} from 'utils/config';
+import {
+  captchaGoogleSpeechApiLangCodes,
+  captchaIbmSpeechApiLangCodes,
+  captchaMicrosoftSpeechApiLangCodes,
+  captchaWitSpeechApiLangCodes,
+  ibmSpeechApiUrls,
+  microsoftSpeechApiUrls
+} from 'utils/data';
+import {clientAppVersion, witApiKeys} from 'utils/config';

 let nativePort;

@ -132,6 +142,305 @@ async function setChallengeLocale() {
  }
 }

+// webRequest `onBeforeSendHeaders` handler: drops the first `Origin` request
+// header whose value equals this page's own origin
+// (`window.location.origin`). Headers with any other value are left
+// untouched. The header array from `details` is edited in place and handed
+// back as `{requestHeaders}`.
+function removeRequestOrigin(details) {
+  const ownOrigin = window.location.origin;
+  const requestHeaders = details.requestHeaders;
+
+  const matchIndex = requestHeaders.findIndex(
+    ({name, value}) => name.toLowerCase() === 'origin' && value === ownOrigin
+  );
+  if (matchIndex !== -1) {
+    requestHeaders.splice(matchIndex, 1);
+  }
+
+  return {requestHeaders};
+}
+
+// Registers `removeRequestOrigin` as a blocking `onBeforeSendHeaders`
+// listener for `xmlhttprequest`-type requests to the URL patterns listed
+// below. Does nothing when the listener is already registered.
+function addBackgroundRequestLitener() {
+  const sendHeadersEvent = browser.webRequest.onBeforeSendHeaders;
+  if (sendHeadersEvent.hasListener(removeRequestOrigin)) {
+    return;
+  }
+
+  const filter = {
+    urls: [
+      'https://www.google.com/*',
+      'https://api.wit.ai/*',
+      'https://speech.googleapis.com/*',
+      'https://stream-fra.watsonplatform.net/*',
+      'https://stream.watsonplatform.net/*',
+      'https://gateway-wdc.watsonplatform.net/*',
+      'https://gateway-syd.watsonplatform.net/*',
+      'https://gateway-tok.watsonplatform.net/*',
+      'https://eastus.stt.speech.microsoft.com/*',
+      'https://westus.stt.speech.microsoft.com/*',
+      'https://westus2.stt.speech.microsoft.com/*',
+      'https://eastasia.stt.speech.microsoft.com/*',
+      'https://southeastasia.stt.speech.microsoft.com/*',
+      'https://westeurope.stt.speech.microsoft.com/*',
+      'https://northeurope.stt.speech.microsoft.com/*'
+    ],
+    types: ['xmlhttprequest']
+  };
+  const extraInfoSpec = ['blocking', 'requestHeaders'];
+
+  sendHeadersEvent.addListener(removeRequestOrigin, filter, extraInfoSpec);
+}
+
+// Unregisters `removeRequestOrigin` from `onBeforeSendHeaders` when it is
+// currently registered; otherwise does nothing.
+function removeBackgroundRequestLitener() {
+  const sendHeadersEvent = browser.webRequest.onBeforeSendHeaders;
+  if (!sendHeadersEvent.hasListener(removeRequestOrigin)) {
+    return;
+  }
+
+  sendHeadersEvent.removeListener(removeRequestOrigin);
+}
+
+// Decodes the given encoded audio and re-renders it offline as a single
+// channel at 16000 Hz, starting 1.5 seconds into the clip and playing
+// `duration - 3` seconds from there. Resolves with whatever
+// `audioBufferToWav` returns for the rendered buffer.
+async function prepareAudio(audio) {
+  const sampleRate = 16000;
+
+  const decodeCtx = new AudioContext();
+  const decoded = await decodeCtx.decodeAudioData(audio);
+  await decodeCtx.close();
+
+  const renderCtx = new OfflineAudioContext(
+    // force mono output
+    1,
+    sampleRate * decoded.duration,
+    sampleRate
+  );
+  const player = renderCtx.createBufferSource();
+  player.buffer = decoded;
+  player.connect(renderCtx.destination);
+  // discard 1.5 second noise from beginning/end
+  player.start(0, 1.5, decoded.duration - 3);
+
+  const rendered = await renderCtx.startRendering();
+  return audioBufferToWav(rendered);
+}
+
+// Resolves the Wit.ai API key for `language`. The `witSpeechApiDemo` service
+// reads it from the imported `witApiKeys` table; any other service reads it
+// from the `witSpeechApiKeys` entry in sync storage. Yields `undefined` when
+// the selected table has no key for that language.
+async function getWitSpeechApiKey(speechService, language) {
+  if (speechService !== 'witSpeechApiDemo') {
+    const stored = await storage.get('witSpeechApiKeys', 'sync');
+    return stored.witSpeechApiKeys[language];
+  }
+
+  return witApiKeys[language];
+}
+
+// Posts the WAV audio to the Wit.ai speech endpoint, authenticated with
+// `apiKey` as a bearer token, and resolves with the trimmed `_text` field of
+// the JSON response.
+//
+// Resolves with `undefined` when the response carries no `_text` string, the
+// same "no result" signal the IBM and Microsoft helpers use, so the caller
+// can fall back to another model instead of failing on a TypeError.
+// Rejects with an Error for any non-200 response.
+async function getWitSpeechApiResult(apiKey, audioContent) {
+  const rsp = await fetch('https://api.wit.ai/speech', {
+    referrer: '',
+    mode: 'cors',
+    method: 'POST',
+    headers: {
+      Authorization: 'Bearer ' + apiKey
+    },
+    body: new Blob([audioContent], {type: 'audio/wav'})
+  });
+
+  if (rsp.status !== 200) {
+    throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
+  }
+
+  const text = (await rsp.json())._text;
+  // a missing `_text` means nothing was recognized, not a failure
+  if (typeof text === 'string') {
+    return text.trim();
+  }
+}
+
+// Posts the WAV audio to the IBM speech endpoint at `apiUrl`, passing
+// `language` as the `model` query parameter and `apiKey` as HTTP basic
+// credentials. Resolves with the trimmed transcript of the first alternative
+// of the first result, or `undefined` when the response has no results.
+// Rejects with an Error for any non-200 response.
+async function getIbmSpeechApiResult(apiUrl, apiKey, audioContent, language) {
+  const endpoint = `${apiUrl}?model=${language}&profanity_filter=false`;
+  const credentials = window.btoa('apiKey:' + apiKey);
+  const payload = new Blob([audioContent], {type: 'audio/wav'});
+
+  const rsp = await fetch(endpoint, {
+    referrer: '',
+    mode: 'cors',
+    method: 'POST',
+    headers: {
+      Authorization: 'Basic ' + credentials,
+      'X-Watson-Learning-Opt-Out': 'true'
+    },
+    body: payload
+  });
+
+  if (rsp.status !== 200) {
+    throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
+  }
+
+  const {results} = await rsp.json();
+  if (!results || !results.length) {
+    return;
+  }
+
+  return results[0].alternatives[0].transcript.trim();
+}
+
+// Posts the WAV audio to the Microsoft speech endpoint at `apiUrl` with
+// `language`, `format=detailed` and `profanity=raw` as query parameters and
+// `apiKey` as the subscription key header. Resolves with the trimmed
+// `Lexical` value of the first `NBest` entry, or `undefined` when `NBest` is
+// missing or empty. Rejects with an Error for any non-200 response.
+async function getMicrosoftSpeechApiResult(
+  apiUrl,
+  apiKey,
+  audioContent,
+  language
+) {
+  const rsp = await fetch(
+    `${apiUrl}?language=${language}&format=detailed&profanity=raw`,
+    {
+      referrer: '',
+      mode: 'cors',
+      method: 'POST',
+      headers: {
+        'Ocp-Apim-Subscription-Key': apiKey,
+        'Content-type': 'audio/wav; codec=audio/pcm; samplerate=16000'
+      },
+      body: new Blob([audioContent], {type: 'audio/wav'})
+    }
+  );
+
+  if (rsp.status !== 200) {
+    throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
+  }
+
+  const results = (await rsp.json()).NBest;
+  // an empty list must be treated as "no result" too, as in the IBM helper;
+  // indexing it would throw instead of letting the caller fall back
+  if (results && results.length) {
+    return results[0].Lexical.trim();
+  }
+}
+
+// Downloads the audio at `audioUrl`, converts it with `prepareAudio` and
+// sends it to the speech service selected by the `speechService` option in
+// sync storage. `lang` is looked up in the service-specific language table,
+// with an English code as the default when there is no entry.
+//
+// When the `tryEnglishSpeechModel` option is set and the mapped language is
+// not English, the Wit.ai, IBM and Microsoft branches retry with an English
+// model if the first attempt gave no transcript; the Google branch instead
+// adds `en-US` as an alternative language to the single request.
+//
+// Resolves with the transcript, or `undefined` when nothing was recognized,
+// when the required API key is missing (an `error_missingApiKey`
+// notification is shown) or when `speechService` matches none of the
+// branches. Rejects when a speech API answers with a non-200 status.
+async function transcribeAudio(audioUrl, lang) {
+  let solution;
+
+  const audioRsp = await fetch(audioUrl, {referrer: ''});
+  const audioContent = await prepareAudio(await audioRsp.arrayBuffer());
+
+  const {speechService, tryEnglishSpeechModel} = await storage.get(
+    ['speechService', 'tryEnglishSpeechModel'],
+    'sync'
+  );
+
+  if (['witSpeechApiDemo', 'witSpeechApi'].includes(speechService)) {
+    const language = captchaWitSpeechApiLangCodes[lang] || 'english';
+
+    const apiKey = await getWitSpeechApiKey(speechService, language);
+    if (!apiKey) {
+      showNotification({messageId: 'error_missingApiKey'});
+      return;
+    }
+
+    solution = await getWitSpeechApiResult(apiKey, audioContent);
+    if (!solution && language !== 'english' && tryEnglishSpeechModel) {
+      // API keys are looked up per language, so the retry needs its own key
+      const apiKey = await getWitSpeechApiKey(speechService, 'english');
+      if (!apiKey) {
+        showNotification({messageId: 'error_missingApiKey'});
+        return;
+      }
+      solution = await getWitSpeechApiResult(apiKey, audioContent);
+    }
+  } else if (speechService === 'googleSpeechApi') {
+    const {googleSpeechApiKey: apiKey} = await storage.get(
+      'googleSpeechApiKey',
+      'sync'
+    );
+    if (!apiKey) {
+      showNotification({messageId: 'error_missingApiKey'});
+      return;
+    }
+    const apiUrl = `https://speech.googleapis.com/v1p1beta1/speech:recognize?key=${apiKey}`;
+
+    const language = captchaGoogleSpeechApiLangCodes[lang] || 'en-US';
+
+    const data = {
+      audio: {
+        content: arrayBufferToBase64(audioContent)
+      },
+      config: {
+        encoding: 'LINEAR16',
+        languageCode: language,
+        model: 'video',
+        sampleRateHertz: 16000
+      }
+    };
+    if (!['en-US', 'en-GB'].includes(language) && tryEnglishSpeechModel) {
+      data.config.model = 'default';
+      data.config.alternativeLanguageCodes = ['en-US'];
+    }
+
+    const rsp = await fetch(apiUrl, {
+      referrer: '',
+      mode: 'cors',
+      method: 'POST',
+      body: JSON.stringify(data)
+    });
+
+    if (rsp.status !== 200) {
+      throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
+    }
+
+    const results = (await rsp.json()).results;
+    // an empty list means "no result", same guard as in the IBM helper
+    if (results && results.length) {
+      solution = results[0].alternatives[0].transcript.trim();
+    }
+  } else if (speechService === 'ibmSpeechApi') {
+    const {
+      ibmSpeechApiLoc: apiLoc,
+      ibmSpeechApiKey: apiKey
+    } = await storage.get(['ibmSpeechApiLoc', 'ibmSpeechApiKey'], 'sync');
+    if (!apiKey) {
+      showNotification({messageId: 'error_missingApiKey'});
+      return;
+    }
+    const apiUrl = ibmSpeechApiUrls[apiLoc];
+    const language =
+      captchaIbmSpeechApiLangCodes[lang] || 'en-US_BroadbandModel';
+
+    solution = await getIbmSpeechApiResult(
+      apiUrl,
+      apiKey,
+      audioContent,
+      language
+    );
+    if (
+      !solution &&
+      !['en-US_BroadbandModel', 'en-GB_BroadbandModel'].includes(language) &&
+      tryEnglishSpeechModel
+    ) {
+      solution = await getIbmSpeechApiResult(
+        apiUrl,
+        apiKey,
+        audioContent,
+        'en-US_BroadbandModel'
+      );
+    }
+  } else if (speechService === 'microsoftSpeechApi') {
+    const {
+      microsoftSpeechApiLoc: apiLoc,
+      microsoftSpeechApiKey: apiKey
+    } = await storage.get(
+      ['microsoftSpeechApiLoc', 'microsoftSpeechApiKey'],
+      'sync'
+    );
+    if (!apiKey) {
+      showNotification({messageId: 'error_missingApiKey'});
+      return;
+    }
+    const apiUrl = microsoftSpeechApiUrls[apiLoc];
+    const language = captchaMicrosoftSpeechApiLangCodes[lang] || 'en-US';
+
+    solution = await getMicrosoftSpeechApiResult(
+      apiUrl,
+      apiKey,
+      audioContent,
+      language
+    );
+    if (
+      !solution &&
+      !['en-US', 'en-GB'].includes(language) &&
+      tryEnglishSpeechModel
+    ) {
+      solution = await getMicrosoftSpeechApiResult(
+        apiUrl,
+        apiKey,
+        audioContent,
+        'en-US'
+      );
+    }
+  }
+
+  return solution;
+}
+
 async function onMessage(request, sender) {
  if (request.id === 'notification') {
    showNotification({
@ -147,6 +456,13 @@ async function onMessage(request, sender) {
    if ([30, 100].includes(useCount)) {
      await showContributePage('use');
    }
+  } else if (request.id === 'transcribeAudio') {
+    addBackgroundRequestLitener();
+    try {
+      // `await` is required: returning the bare promise would run `finally`
+      // as soon as the promise is created, unregistering the listener before
+      // the speech API requests are sent
+      return await transcribeAudio(request.audioUrl, request.lang);
+    } finally {
+      removeBackgroundRequestLitener();
+    }
  } else if (request.id === 'resetCaptcha') {
    await resetCaptcha(sender.tab.id, sender.frameId, request.challengeUrl);
  } else if (request.id === 'getFramePos') {
--- a/src/solve/main.js
+++ b/src/solve/main.js
@ -1,24 +1,9 @@
 import browser from 'webextension-polyfill';
-import audioBufferToWav from 'audiobuffer-to-wav';

 import storage from 'storage/storage';
 import {meanSleep, pingClientApp} from 'utils/app';
-import {
-  getText,
-  waitForElement,
-  arrayBufferToBase64,
-  getRandomFloat,
-  sleep
-} from 'utils/common';
-import {
-  captchaGoogleSpeechApiLangCodes,
-  captchaIbmSpeechApiLangCodes,
-  captchaMicrosoftSpeechApiLangCodes,
-  captchaWitSpeechApiLangCodes,
-  ibmSpeechApiUrls,
-  microsoftSpeechApiUrls
-} from 'utils/data';
-import {clientAppVersion, witApiKeys} from 'utils/config';
+import {getText, waitForElement, getRandomFloat, sleep} from 'utils/common';
+import {clientAppVersion} from 'utils/config';

 let solverWorking = false;

@ -97,26 +82,6 @@ function isBlocked({timeout = 0} = {}) {
  return Boolean(document.querySelector(selector));
 }

-async function prepareAudio(audio) {
-  const ctx = new AudioContext();
-  const data = await ctx.decodeAudioData(audio);
-  await ctx.close();
-
-  const offlineCtx = new OfflineAudioContext(
-    // force mono output
-    1,
-    16000 * data.duration,
-    16000
-  );
-  const source = offlineCtx.createBufferSource();
-  source.buffer = data;
-  source.connect(offlineCtx.destination);
-  // discard 1.5 second noise from beginning/end
-  source.start(0, 1.5, data.duration - 3);
-
-  return audioBufferToWav(await offlineCtx.startRendering());
-}
-
 function dispatchEnter(node) {
  const keyEvent = {
    code: 'Enter',
@ -246,7 +211,7 @@ async function getElementScreenRect(node, browserBorder) {
    height /= osScale;
  }

-return {x, y, width, height};
+  return {x, y, width, height};
 }

 async function getClickPos(node, browserBorder) {
@ -258,94 +223,7 @@ async function getClickPos(node, browserBorder) {
  };
 }

-async function getWitSpeechApiKey(speechService, language) {
-  if (speechService === 'witSpeechApiDemo') {
-    return witApiKeys[language];
-  } else {
-    const {witSpeechApiKeys: apiKeys} = await storage.get(
-      'witSpeechApiKeys',
-      'sync'
-    );
-    return apiKeys[language];
-  }
-}
-
-async function getWitSpeechApiResult(apiKey, audioContent) {
-  const rsp = await fetch('https://api.wit.ai/speech', {
-    referrer: '',
-    mode: 'cors',
-    method: 'POST',
-    headers: {
-      Authorization: 'Bearer ' + apiKey
-    },
-    body: new Blob([audioContent], {type: 'audio/wav'})
-  });
-
-  if (rsp.status !== 200) {
-    throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
-  }
-
-  return (await rsp.json())._text.trim();
-}
-
-async function getIbmSpeechApiResult(apiUrl, apiKey, audioContent, language) {
-  const rsp = await fetch(
-    `${apiUrl}?model=${language}&profanity_filter=false`,
-    {
-      referrer: '',
-      mode: 'cors',
-      method: 'POST',
-      headers: {
-        Authorization: 'Basic ' + window.btoa('apiKey:' + apiKey),
-        'X-Watson-Learning-Opt-Out': 'true'
-      },
-      body: new Blob([audioContent], {type: 'audio/wav'})
-    }
-  );
-
-  if (rsp.status !== 200) {
-    throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
-  }
-
-  const results = (await rsp.json()).results;
-  if (results && results.length) {
-    return results[0].alternatives[0].transcript.trim();
-  }
-}
-
-async function getMicrosoftSpeechApiResult(
-  apiUrl,
-  apiKey,
-  audioContent,
-  language
-) {
-  const rsp = await fetch(
-    `${apiUrl}?language=${language}&format=detailed&profanity=raw`,
-    {
-      referrer: '',
-      mode: 'cors',
-      method: 'POST',
-      headers: {
-        'Ocp-Apim-Subscription-Key': apiKey,
-        'Content-type': 'audio/wav; codec=audio/pcm; samplerate=16000'
-      },
-      body: new Blob([audioContent], {type: 'audio/wav'})
-    }
-  );
-
-  if (rsp.status !== 200) {
-    throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
-  }
-
-  const results = (await rsp.json()).NBest;
-  if (results) {
-    return results[0].Lexical.trim();
-  }
-}
-
 async function solve(simulateUserInput, clickEvent) {
-  let solution;
-
  if (isBlocked()) {
    return;
  }
@ -435,158 +313,13 @@ async function solve(simulateUserInput, clickEvent) {
  }

  const audioUrl = audioEl.src;
-
  const lang = document.documentElement.lang;
-  const audioRsp = await fetch(audioUrl, {referrer: ''});
-  const audioContent = await prepareAudio(await audioRsp.arrayBuffer());
-
-  const {speechService, tryEnglishSpeechModel} = await storage.get(
-    ['speechService', 'tryEnglishSpeechModel'],
-    'sync'
-  );

-  if (['witSpeechApiDemo', 'witSpeechApi'].includes(speechService)) {
-    const language = captchaWitSpeechApiLangCodes[lang] || 'english';
-
-    const apiKey = await getWitSpeechApiKey(speechService, language);
-    if (!apiKey) {
-      browser.runtime.sendMessage({
-        id: 'notification',
-        messageId: 'error_missingApiKey'
-      });
-      return;
-    }
-
-    solution = await getWitSpeechApiResult(apiKey, audioContent);
-    if (!solution && language !== 'english' && tryEnglishSpeechModel) {
-      const apiKey = await getWitSpeechApiKey(speechService, 'english');
-      if (!apiKey) {
-        browser.runtime.sendMessage({
-          id: 'notification',
-          messageId: 'error_missingApiKey'
-        });
-        return;
-      }
-      solution = await getWitSpeechApiResult(apiKey, audioContent);
-    }
-  } else if (speechService === 'googleSpeechApi') {
-    const {googleSpeechApiKey: apiKey} = await storage.get(
-      'googleSpeechApiKey',
-      'sync'
-    );
-    if (!apiKey) {
-      browser.runtime.sendMessage({
-        id: 'notification',
-        messageId: 'error_missingApiKey'
-      });
-      return;
-    }
-    const apiUrl = `https://speech.googleapis.com/v1p1beta1/speech:recognize?key=${apiKey}`;
-
-    const language = captchaGoogleSpeechApiLangCodes[lang] || 'en-US';
-
-    const data = {
-      audio: {
-        content: arrayBufferToBase64(audioContent)
-      },
-      config: {
-        encoding: 'LINEAR16',
-        languageCode: language,
-        model: 'video',
-        sampleRateHertz: 16000
-      }
-    };
-    if (!['en-US', 'en-GB'].includes(language) && tryEnglishSpeechModel) {
-      data.config.model = 'default';
-      data.config.alternativeLanguageCodes = ['en-US'];
-    }
-
-    const rsp = await fetch(apiUrl, {
-      referrer: '',
-      mode: 'cors',
-      method: 'POST',
-      body: JSON.stringify(data)
-    });
-
-    if (rsp.status !== 200) {
-      throw new Error(`API response: ${rsp.status}, ${await rsp.text()}`);
-    }
-
-    const results = (await rsp.json()).results;
-    if (results) {
-      solution = results[0].alternatives[0].transcript.trim();
-    }
-  } else if (speechService === 'ibmSpeechApi') {
-    const {
-      ibmSpeechApiLoc: apiLoc,
-      ibmSpeechApiKey: apiKey
-    } = await storage.get(['ibmSpeechApiLoc', 'ibmSpeechApiKey'], 'sync');
-    if (!apiKey) {
-      browser.runtime.sendMessage({
-        id: 'notification',
-        messageId: 'error_missingApiKey'
-      });
-      return;
-    }
-    const apiUrl = ibmSpeechApiUrls[apiLoc];
-    const language =
-      captchaIbmSpeechApiLangCodes[lang] || 'en-US_BroadbandModel';
-
-    solution = await getIbmSpeechApiResult(
-      apiUrl,
-      apiKey,
-      audioContent,
-      language
-    );
-    if (
-      !solution &&
-      !['en-US_BroadbandModel', 'en-GB_BroadbandModel'].includes(language) &&
-      tryEnglishSpeechModel
-    ) {
-      solution = await getIbmSpeechApiResult(
-        apiUrl,
-        apiKey,
-        audioContent,
-        'en-US_BroadbandModel'
-      );
-    }
-  } else if (speechService === 'microsoftSpeechApi') {
-    const {
-      microsoftSpeechApiLoc: apiLoc,
-      microsoftSpeechApiKey: apiKey
-    } = await storage.get(
-      ['microsoftSpeechApiLoc', 'microsoftSpeechApiKey'],
-      'sync'
-    );
-    if (!apiKey) {
-      browser.runtime.sendMessage({
-        id: 'notification',
-        messageId: 'error_missingApiKey'
-      });
-      return;
-    }
-    const apiUrl = microsoftSpeechApiUrls[apiLoc];
-    const language = captchaMicrosoftSpeechApiLangCodes[lang] || 'en-US';
-
-    solution = await getMicrosoftSpeechApiResult(
-      apiUrl,
-      apiKey,
-      audioContent,
-      language
-    );
-    if (
-      !solution &&
-      !['en-US', 'en-GB'].includes(language) &&
-      tryEnglishSpeechModel
-    ) {
-      solution = await getMicrosoftSpeechApiResult(
-        apiUrl,
-        apiKey,
-        audioContent,
-        'en-US'
-      );
-    }
-  }
+  const solution = await browser.runtime.sendMessage({
+    id: 'transcribeAudio',
+    audioUrl,
+    lang
+  });

  if (!solution) {
    browser.runtime.sendMessage({
--- a/src/utils/data.js
+++ b/src/utils/data.js
@ -323,7 +323,7 @@ const ibmSpeechApiUrls = {
    'https://gateway-tok.watsonplatform.net/speech-to-text/api/v1/recognize'
 };

-// https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-apis#regions-and-endpoints
+// https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/rest-speech-to-text
 const microsoftSpeechApiUrls = {
  eastUs:
    'https://eastus.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1',