Raw audio waveforms, represented as a series of amplitude values over time, are high-dimensional and contain much information that is not directly useful for speech recognition. Feeding this raw data, with its tens of thousands of samples per second, directly into a neural network is computationally inefficient and can obscure the phonetic patterns the model needs to learn. This is why feature extraction is a standard step in an ASR pipeline. The goal is to condense the raw audio into a lower-dimensional, more informative representation.
In this chapter, we will implement the techniques used to create these features. You will learn to build two widely used representations: Mel Frequency Cepstral Coefficients (MFCCs) and log-mel spectrograms. We will go through the step-by-step process for calculating each, compare their characteristics, and discuss why one might be preferred over the other for modern deep learning models. We will also cover normalization techniques, such as Cepstral Mean and Variance Normalization (CMVN), which make features more consistent across recordings. The chapter concludes with a practical exercise where you will write code to process an entire audio dataset into a feature set ready for training.
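As a preview of what the chapter builds up step by step, the sketch below computes all three representations with the librosa library. It is a minimal example, not the chapter's reference implementation: the file path `speech.wav`, the 25 ms / 10 ms window settings, the 80 mel filters, and the 13 MFCC coefficients are illustrative assumptions, chosen because they are common defaults in ASR front ends.

```python
import numpy as np
import librosa

# Assumed input: a mono speech recording at a hypothetical path,
# resampled to 16 kHz on load.
y, sr = librosa.load("speech.wav", sr=16000)

# Log-mel spectrogram: 25 ms windows (400 samples), 10 ms hop
# (160 samples), 80 mel filters -- common choices for deep learning ASR.
mel = librosa.feature.melspectrogram(
    y=y, sr=sr, n_fft=400, hop_length=160, n_mels=80
)
log_mel = librosa.power_to_db(mel)  # shape: (80, num_frames)

# MFCCs: keep the first 13 cepstral coefficients per frame.
mfcc = librosa.feature.mfcc(
    y=y, sr=sr, n_fft=400, hop_length=160, n_mfcc=13
)  # shape: (13, num_frames)

def cmvn(feats: np.ndarray, eps: float = 1e-8) -> np.ndarray:
    """Per-utterance CMVN: zero mean, unit variance along the time axis."""
    mean = feats.mean(axis=1, keepdims=True)
    std = feats.std(axis=1, keepdims=True)
    return (feats - mean) / (std + eps)

log_mel_norm = cmvn(log_mel)
mfcc_norm = cmvn(mfcc)
```

Each output is a matrix of shape (num_features, num_frames), so a one-second utterance at a 10 ms hop yields roughly 100 frames, a far more compact input than 16,000 raw samples. Sections 2.3 through 2.5 unpack what each of these calls does internally.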
2.1 The Role of Feature Extraction in ASR
2.2 Mel Frequency Cepstral Coefficients (MFCCs)
2.3 Calculating MFCCs Step-by-Step
2.4 Filter Banks and Log-Mel Spectrograms
2.5 Feature Normalization Techniques
2.6 Comparing MFCCs and Spectrograms as Input Features
2.7 Practice: Extracting and Normalizing Features from a Dataset