luisfmelo
4/11/2017 - 8:25 AM

Check the 1000 most used words in subtitles

Check the 1000 most used words in subtitles

#!/bin/bash

#######################################
# Build a word-frequency table from subtitle text.
# Reads subtitle text on stdin and writes one "<count> <word>" line per
# distinct word to stdout, highest count first.
# The body runs in a subshell with LC_ALL=C so every stage works on plain
# bytes: A-Z/a-z mean the ASCII letters only, grep does not classify input
# with non-UTF-8 bytes as binary, and words with equal counts come out in the
# same order on every machine. Only tr, sed, sort and uniq features that are
# not GNU-specific are used.
# Arguments: none
# Outputs:   the frequency table on stdout
#######################################
word_frequencies() (
  export LC_ALL=C
  grep -v '^[0-9]' |        # Remove lines that start with a digit.
    tr '?!.|' '    ' |      # Replace question marks, exclamation marks, dots and pipes with spaces.
    tr -cd "A-Za-z' \n" |   # Remove everything but letters, apostrophes, spaces and line breaks (backslashes go too).
    tr 'A-Z ' 'a-z\n' |     # Lowercase every letter and turn every space into a line break (one word per line).
    sed '/^$/d' |           # Remove the empty lines left behind by blank lines and runs of spaces.
    sort |                  # Sort the words so that identical words are adjacent.
    uniq -c |               # Collapse identical words into one line prefixed with the number of occurrences.
    sort -rn |              # Order the list by number of occurrences, most frequent first.
    sed 's/^ *//'           # Remove the padding spaces uniq -c puts in front of the count.
)

# Get all subtitles for the friends and himym series and write their ranked word list to all_data.csv.
cat ~/friends/* ~/himym/* | word_frequencies > all_data.csv

# Keep only the first 1000 rows of all_data.csv (the 1000 most used words) and store them in 1000.csv.
head -n 1000 all_data.csv > 1000.csv

#######################################
# Sum the occurrence counts of "<count> <word>" lines.
# Adds up the first field of every input line and prints the total; prints 0
# when there are no lines at all.
# Arguments: optional file names to read; stdin is read when none are given
# Outputs:   the total on stdout
#######################################
sum_counts() {
  # "sum + 0" forces a numeric result so empty input prints 0, not a blank line.
  awk '{ sum += $1 } END { print sum + 0 }' "$@"
}

# Number of occurrences of the top 1000 words
head -n 1000 all_data.csv | sum_counts

# Total word count
sum_counts all_data.csv