#!/bin/bash
# Check the 1000 most used words in subtitles.
#
# The shebang must be the very first line of the file to take effect, and the
# description must be a comment: as a bare line it would be executed as a
# command named "Check" and fail with "command not found".
#######################################
# Build a word-frequency table from subtitle text.
# Inputs:  raw subtitle lines on stdin.
# Outputs: one "<count> <word>" line per distinct word on stdout, the most
#          frequent word first. Words are lowercased and consist only of
#          ASCII letters and apostrophes.
# Note:    the text-filtering stages run with LC_ALL=C so that "A-Za-z" means
#          exactly the ASCII letters, whatever locale the caller uses.
#######################################
count_words() {
  # Remove lines that start with a digit (cue numbers, timestamps).
  # Known limitation: dialogue lines that begin with a digit are dropped too.
  LC_ALL=C grep -v '^[0-9]' |
    # Replace question marks, exclamation marks, dots and pipes with spaces,
    # then remove everything but letters, apostrophes and spaces. The
    # apostrophe is written unescaped: inside double quotes "\'" would put a
    # literal backslash into the bracket expression and let backslashes
    # through.
    LC_ALL=C sed -e 's/[?!.|]/ /g' -e "s/[^A-Za-z' ]//g" |
    # Turn every run of spaces into a single line break (one word per line).
    # tr is used instead of a "\n" replacement in sed, which only GNU sed
    # understands.
    tr -s ' ' '\n' |
    # Remove the empty line left behind when the input starts with a space.
    sed '/^$/d' |
    # Turn every uppercase character into lowercase.
    LC_ALL=C tr '[:upper:]' '[:lower:]' |
    # Sort the words so that identical words become adjacent.
    sort |
    # Collapse identical words and prefix each with its number of occurrences.
    uniq -c |
    # Order the list by number of occurrences, highest first.
    sort -rn |
    # Remove the padding spaces that uniq -c puts in front of the count.
    sed 's/^ *//'
}

# Get all subtitles for the Friends and HIMYM series and store the complete
# frequency table in all_data.csv.
cat ~/friends/* ~/himym/* | count_words > all_data.csv
# Save the first 1000 rows of the frequency table (the 1000 most used words)
# to 1000.csv.
head -n 1000 all_data.csv > 1000.csv
#######################################
# Sum the first column (the occurrence counts) of frequency-table rows.
# Arguments: optional file names; rows are read from stdin when none is given.
# Outputs:   the sum on stdout; 0 when there are no rows.
#######################################
sum_counts() {
  # "sum + 0" forces a numeric result so empty input prints 0 instead of an
  # empty line.
  awk '{ sum += $1 } END { print sum + 0 }' "$@"
}

# Number of occurrences of the top 1000 words.
head -n 1000 all_data.csv | sum_counts
# Total word count (occurrences of every word in all_data.csv).
sum_counts all_data.csv