Merge pull request #64 from annika-magaro/main

final proposal fixed merge conflict
deep-learning-mit · Dec 16, 2023 · dd9dfad · dd9dfad
2 parents 12478a3 + 244c368
commit dd9dfad
Show file tree

Hide file tree

Showing 9 changed files with 164 additions and 6 deletions.
diff --git a/_posts/2023-11-10-speech-recognition-proposal.md b/_posts/2023-11-10-speech-recognition-proposal.md
diff --git a/assets/bibliography/2023-11-10-speech-recognition-proposal.bib b/assets/bibliography/2023-11-10-speech-recognition-proposal.bib
@@ -0,0 +1,76 @@
+% Common Voice speech corpus (arXiv preprint).
+% NOTE(review): the full author list was filled in from the arXiv record -- confirm before publishing.
+@article{ardila2019,
+  title={Common voice: A massively-multilingual speech corpus},
+  author={Ardila, R. and Branson, M. and Davis, K. and Henretty, M. and Kohler, M. and Meyer, J. and Morais, R. and Saunders, L. and Tyers, F. M. and Weber, G.},
+  journal={arXiv preprint arXiv:1912.06670},
+  eprint={1912.06670},
+  archiveprefix={arXiv},
+  year={2019}
+}
+% AudioSet paper from ICASSP 2017: a conference paper, so the venue is a booktitle and the page range has its own field.
+% NOTE(review): one author (Plakal, M.) was filled in to complete the list -- confirm against the proceedings.
+@inproceedings{gemmeke2017,
+  title={Audio set: An ontology and human-labeled dataset for audio events},
+  author={Gemmeke, J. F. and Ellis, D. P. and Freedman, D. and Jansen, A. and Lawrence, W. and Moore, R. C. and Plakal, M. and Ritter, M.},
+  booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+  pages={776--780},
+  year={2017}
+}
+% Kell et al. (2018), Neuron journal article; volume, issue and pages are kept in their own fields.
+@article{kell2018,
+  title={A Task-Optimized Neural Network Replicates Human Auditory Behavior, Predicts Brain Responses, and Reveals a Cortical Processing Hierarchy},
+  author={Kell, A. J. E. and Yamins, D. L. K. and Shook, E. N. and Norman-Haignere, S. V. and McDermott, J. H.},
+  journal={Neuron},
+  volume={98},
+  number={3},
+  pages={630--644.e16},
+  year={2018}
+}
+% McDermott and Simoncelli (2011), Neuron journal article; names are separated by " and " only.
+@article{mcdermott2011,
+  title={Sound texture perception via statistics of the auditory periphery: evidence from sound synthesis},
+  author={McDermott, J. H. and Simoncelli, E. P.},
+  journal={Neuron},
+  volume={71},
+  pages={926--940},
+  year={2011}
+}
+% McDermott, Schemitsch and Simoncelli (2013), Nature Neuroscience journal article.
+@article{mcdermott2013,
+  title={Summary statistics in auditory perception},
+  author={McDermott, J. H. and Schemitsch, M. and Simoncelli, E. P.},
+  journal={Nature Neuroscience},
+  volume={16},
+  pages={493--498},
+  year={2013}
+}
+
+% Transformer-based Kazakh speech recognition study, Scientific Reports; 8337 is the article number, stored in pages.
+% The key is kept as-is so existing citations continue to resolve.
+@article{orken2022,
+  title={A study of transformer-based end-to-end speech recognition system for Kazakh language},
+  author={Mamyrbayev, O. and Oralbekova, D. and Alimhan, K. and Turdalykyzy, T. and Othman, M.},
+  journal={Scientific Reports},
+  volume={12},
+  pages={8337},
+  year={2022}
+}
+
+% Tuli et al. (2021), arXiv preprint; the identifier is also exposed through the eprint fields.
+@article{tuli2021,
+  title={Are Convolutional Neural Networks or Transformers more like human vision?},
+  author={Tuli, S. and Dasgupta, I. and Grant, E. and Griffiths, T. L.},
+  journal={arXiv preprint arXiv:2105.07197},
+  eprint={2105.07197},
+  archiveprefix={arXiv},
+  year={2021}
+}
+
+% Saddler, Gonzalez and McDermott (2021), Nature Communications; 7278 is the article number, stored in pages.
+@article{saddler2021,
+  title={Deep neural network models reveal interplay of peripheral coding and stimulus statistics in pitch perception},
+  author={Saddler, M. R. and Gonzalez, R. and McDermott, J. H.},
+  journal={Nature Communications},
+  volume={12},
+  pages={7278},
+  year={2021}
+}
+
+% Feather et al. (2019), paper in the NeurIPS proceedings series (volume 32); stored as inproceedings with the series as booktitle.
+@inproceedings{feather2019,
+  title={Metamers of neural networks reveal divergence from human perceptual systems},
+  author={Feather, J. and Durango, A. and Gonzalez, R. and McDermott, J. H.},
+  booktitle={Advances in Neural Information Processing Systems},
+  volume={32},
+  year={2019}
+}
+
+% Miller and Licklider (1950), journal article; volume and page range are kept out of the journal name.
+@article{ml1950,
+  title={The intelligibility of interrupted speech},
+  author={Miller, G. A. and Licklider, J. C. R.},
+  journal={Journal of the Acoustical Society of America},
+  volume={22},
+  pages={167--173},
+  year={1950}
+}
+
+Ardila, R., Branson, M., Davis, K., Henretty, M., Kohler, M., Meyer, J., ... & Weber, G. (2019). Common voice: A massively-multilingual speech corpus. arXiv preprint arXiv:1912.06670.
+
+Gemmeke, J. F., Ellis, D. P., Freedman, D., Jansen, A., Lawrence, W., Moore, R. C., ... & Ritter, M. (2017, March). Audio set: An ontology and human-labeled dataset for audio events. In 2017 IEEE international conference on acoustics, speech and signal processing (ICASSP) pp. 776-780.
+
+Kell, A.J.E., Yamins, D.L.K., Shook, E.N., Norman-Haignere, S.V., McDermott, J.H. A Task-Optimized Neural Network Replicates Human Auditory Behavior, Predicts Brain Responses, and Reveals a Cortical Processing Hierarchy. Neuron. 2018 May 2; 98 (3): 630-644.e16.
+
+McDermott, J.H., and Simoncelli, E.P. (2011). Sound texture perception via statistics of the auditory periphery: evidence from sound synthesis. Neuron 71, 926–940.
+
+McDermott, J.H., Schemitsch, M., and Simoncelli, E.P. (2013). Summary statistics in auditory perception. Nat. Neurosci. 16, 493–498.
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/accuracy-by-epoch.png b/assets/img/2023-11-10-speech-recognition-proposal/accuracy-by-epoch.png
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/background-noise.png b/assets/img/2023-11-10-speech-recognition-proposal/background-noise.png
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/block-architectures.png b/assets/img/2023-11-10-speech-recognition-proposal/block-architectures.png
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/clean.png b/assets/img/2023-11-10-speech-recognition-proposal/clean.png
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/manipulations-1.png b/assets/img/2023-11-10-speech-recognition-proposal/manipulations-1.png
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/manipulations-2.png b/assets/img/2023-11-10-speech-recognition-proposal/manipulations-2.png
diff --git a/assets/img/2023-11-10-speech-recognition-proposal/network-architectures.png b/assets/img/2023-11-10-speech-recognition-proposal/network-architectures.png