Skip to content

Dataset

Yasantha Niroshan edited this page Sep 29, 2024 · 1 revision

Data Preprocessing and Visualization

Data Preprocessing

Since all the data was collected in raw format, I first had to clean and preprocess it. The collected CSV did not need much preprocessing. However, the audio data stored in text files had to be preprocessed and encoded to .wav files.

Preprocessing Audio Files (Text)

The collected audio files look like the following:

Starting Collecting Data 499,912,911,1206,1329,1430,1381,1637,1797,1698,1607,1860,

Note

You can find all the raw data files here.

So we first need to remove the leading header text and the separating commas.

def process_txt_file(file_name:str,path:str)->'np.ndarray | None':
    """
    This function processes the raw text file and returns the processed data.

    The second line of the file (index 1) is parsed as comma-separated
    integer samples; the first line is ignored. Leading/trailing commas and
    whitespace around the sample line are removed before parsing.

    :param file_name: Name of the raw text file to be processed.
    :param path: Directory that contains the raw text file.
    :return: Samples as a 1-D int16 array, or None when the file is missing,
             unreadable, has no sample line, or contains non-integer data.
    """
    try:
        with open(os.path.join(path, file_name), 'r') as file:
            lines = file.read().split('\n')

        if len(lines) < 2:
            # Only one line (or an empty file): there is no sample line.
            return None

        # Strip commas and whitespace in a single pass. Stripping commas
        # first and whitespace second leaves a dangling comma behind for a
        # line such as "1,2,3, " and makes int('') fail on the last token.
        string_data = lines[1].strip(', \t\r')
        if not string_data:
            return None

        samples = [int(sample.strip()) for sample in string_data.split(',')]
        return np.array(samples, dtype=np.int16)

    except FileNotFoundError:
        print(f'File {file_name} not found in {path}')
        return None
    except IOError:
        print(f'Error reading file {file_name} in {path}')
        return None
    except ValueError as e:
        # Raised by int() when a token is not an integer.
        print(f'Error processing file {file_name} in {path}. Error: {e}')
        return None
    except Exception as e:
        # Best-effort: the caller treats None as "skip this file".
        print(f'Error reading file {file_name} in {path}. Error: {e}')
        return None

Encoding to .wav file

  • Sampling Rate - one sample every 1/1000 s (1 ms) = 1 kHz
  • Resolution of ADC Converter - 12 bits
def write_wav_file(output_file:str, audio_data:np.ndarray, sample_rate:int=1000) -> None:
    """
    This function writes the audio data to a mono, 16-bit PCM WAV file.

    Each sample is scaled from 12-bit to 16-bit resolution by a left shift of
    4 bits. Results that do not fit in a signed 16-bit sample are saturated
    to the int16 limits instead of wrapping around.

    :param output_file: Name of the WAV file to be written.
    :param audio_data: Audio data to be written to the WAV file.
    :param sample_rate: Sample rate of the audio data.
    """
    if audio_data is None or len(audio_data) == 0:
        print(f'No audio data to write for {output_file}')
        return

    try:
        # Widen before shifting: in int16 arithmetic any sample >= 2048
        # overflows on `<< 4` and silently wraps to a negative value.
        scaled = np.asarray(audio_data).astype(np.int64) << 4
        limits = np.iinfo(np.int16)
        # NOTE(review): samples look like unsigned ADC counts; if they should
        # be centred around mid-scale before scaling, confirm and subtract it.
        # WAV PCM data is little-endian, so fix the byte order explicitly.
        audio_data_16bit = np.clip(scaled, limits.min, limits.max).astype('<i2')

        with wave.open(output_file, 'wb') as wav_file:
            wav_file.setnchannels(1)   # mono
            wav_file.setsampwidth(2)   # 2 bytes = 16-bit samples
            wav_file.setframerate(sample_rate)
            wav_file.writeframes(audio_data_16bit.tobytes())

    except IOError as e:
        print(f'IOError writing file {output_file}: {e}')
    except Exception as e:
        # Best-effort: report and continue so one bad file does not abort a batch.
        print(f'Error writing file {output_file}: {e}')

Writing to .wav Files

Then the preprocessed audio data above needs to be written to .wav files.

# Convert every raw text recording into a .wav file in the preprocessed folder.
text_files:list = list_all_raw_txt_files(RAW_DATA_PATH)
total_processed_files:int = 0

# Check for a missing listing before calling len() on it; otherwise a None
# result raises TypeError and this guard is never reached.
if text_files is None:
    print(f'No text files found in {RAW_DATA_PATH}')
    # SystemExit is what exit() raises, without relying on the site module.
    raise SystemExit(0)

total_files:int = len(text_files)
print(f'Found {total_files} text files in {RAW_DATA_PATH}')

print('Processing text files...')
for file in text_files:
    print("-"*50)
    print(f'Processing {file}...')
    audio_data = process_txt_file(file,RAW_DATA_PATH)
    if audio_data is None:
        print(f'Error processing {file}. No data found in the file. Skipping...')
        continue
    print(f"Found {len(audio_data)} samples")
    # splitext drops only the final extension, so names containing extra
    # dots keep their full stem and do not collide on the same output file.
    wav_file_name = os.path.splitext(file)[0] + '.wav'
    audi_file_path = os.path.join(PREPROCESSED_AUDIO_FILES_PATH, wav_file_name)
    write_wav_file(audi_file_path, audio_data)
    print(f'Processed {file} and saved as {wav_file_name}')
    total_processed_files += 1

Tip

You can find all the encoded audio files here.

Preprocessing csv files

The CSV file only needs a header row added, because the raw data is already in CSV format.

def process_csv_file(file_name:str)->list:
    """
    This function reads a header-less CSV file and returns its rows.

    Because the column names are supplied explicitly from FIELDS, every line
    of the file, including the first, is treated as a data row.

    :param file_name: Name of the CSV file to be processed.
    :return: List of dictionaries, one per row, keyed by the names in FIELDS.
    """
    # Open the file that was asked for, not the module-level RAW_CSV.
    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are interpreted correctly.
    with open(file_name, 'r', newline='') as file:
        return list(csv.DictReader(file, fieldnames=FIELDS))


def write_csv_file(file_name:str, data:list)->None:
    """
    This function writes the processed rows to a CSV file with a header row.

    :param file_name: Name of the CSV file to be written.
    :param data: Iterable of row dictionaries keyed by the names in FIELDS.
    """
    # newline='' stops the platform from translating the line endings the csv
    # writer emits, which otherwise produces blank rows on Windows.
    with open(file_name, 'w', newline='') as file:
        csv_dict_writer = csv.DictWriter(file, fieldnames=FIELDS)
        csv_dict_writer.writeheader()
        csv_dict_writer.writerows(data)

# Read the rows from RAW_CSV and write them to PREPROCESSED_CSV with a header row.
write_csv_file(PREPROCESSED_CSV, process_csv_file(RAW_CSV))

Data Visualization