24 Sep 2022
As part of my project Transcribee, I need to clean up transcripts where the beginning of a line can contain an unexpected number of unwanted characters.
The goal is to strip the leading non-alphabetical characters and capitalise the first letter.
Here is what I came up with. I'm not sure it's the most Pythonic way, but it works:
def clean_beginning_string(string_input: str) -> str:
    """Strip leading non-alphabetical characters and capitalise the first letter.

    Characters are dropped from the start of ``string_input`` until the first
    one for which ``str.isalpha()`` is true; that character is upper-cased and
    the remainder of the string is returned unchanged.

    Args:
        string_input: The raw line to clean.

    Returns:
        The cleaned line, or an empty string when ``string_input`` is empty
        or contains no alphabetical character at all.
    """
    # Scan the whole string rather than a fixed number of characters, so an
    # arbitrarily long run of leading junk is handled.
    for index, char in enumerate(string_input):
        if char.isalpha():
            return char.upper() + string_input[index + 1:]
    # Nothing alphabetical was found (this also covers the empty string).
    return ""
Test:
# Sample line beginnings (leading spaces and punctuation, in mixed case) used
# to exercise clean_beginning_string.
list_to_clean = [
' This is',
' another beginning',
' for the first time',
' ?or is it',
'?perhaps not',
'.who knows',
]
def clean_beginning_string(string_input: str) -> str:
    """Strip leading non-alphabetical characters and capitalise the first letter.

    Characters are dropped from the start of ``string_input`` until the first
    one for which ``str.isalpha()`` is true; that character is upper-cased and
    the remainder of the string is returned unchanged.

    Args:
        string_input: The raw line to clean.

    Returns:
        The cleaned line, or an empty string when ``string_input`` is empty
        or contains no alphabetical character at all.
    """
    # Scan the whole string rather than a fixed number of characters, so an
    # arbitrarily long run of leading junk is handled.
    for index, char in enumerate(string_input):
        if char.isalpha():
            return char.upper() + string_input[index + 1:]
    # Nothing alphabetical was found (this also covers the empty string).
    return ""
# Clean each sample in order and print one result per line.
for cleaned in map(clean_beginning_string, list_to_clean):
    print(cleaned)
outputs:
This is
Another beginning
For the first time
Or is it
Perhaps not
Who knows