-module(index).
-export([index_file/1,show_index/1,get_file_contents/1,show_file_contents/1]).
-include_lib("eunit/include/eunit.hrl").
%%% ASSIGNMENT 2.20 - INDEXING WORDS IN LINES FROM A TEXT FILE
% Example files available in:
% gettysburg-address.txt (short)
% dickens-christmas.txt (long)
% THIS VERSION: "ATTACK OF THE LIST-CLONES!"
% NEXT VERSION: "LIST COMPREHENSIONS (AND FRIENDS) FIGHT BACK!"
%
% You are floating on the sun drench undulating waves of a vast ocean when
% you catch a golden glint from the depths. It's deep, very deep, but you
% are enticed by the thought of riches-beyond-reason and down you dive...
% STRATEGY
%
% Index creation:
% - For each new word
% - Find insertion point in Index (before/word-idx/after)
% - Insert {Word,[LineNum]} into Index
% - Each index to use linear list of line numbers,
% - When source read, transform each list into list of {From,To} ranges.
%
% Stop Words:
% - simple list to start with,
% - candidates extracted from corpus based on length and/or freq,
% - file-based, possibly based on std list generally used.
%
% Stemming:
% - boolean switch for extended search
% - simple version using word + common endings, eg, "s","er","ing", etc
%
% IMPROVEMENTS:
% - tests
% - more comments
% - dict/map for word-based look-up
% - list comprehensions (and anything else to shorten this program!)
% - boolean search terms
% - better output formatting
% - include count of repeats per word per line for, eg, word-freq. Or...
% - preserve file name for, eg, back-referencing, word-freq.
% "CONSTANTS"
% Stop Words
% Since words with 3 or less letters are excluded, only stop words with four or
% more letters are needed. TBD: candidates, file-based.
stop_words() -> ["have","come","that","those","here"]. % sample
% Stemming (TBD)
% stemmings() -> ["s","er","ing"]. % sample
% INDEXING
% Build Index: simply collect indexes, even repeats, eg,
% { "foo" , [3,4,4,5,7,11,12,13] }
index_file(FileName) ->
Lines = get_file_contents(FileName),
condense_index([],index_lines(Lines)).
condense_index(NewIndex,[]) -> lists:reverse(NewIndex);
condense_index(NewIndex,[{Word,Lines}|Xs]) ->
condense_index([{Word,condense_lines(Lines)}|NewIndex],Xs).
condense_lines([N]) -> [{N,N}];
condense_lines([X|Xs]) -> to_ranges([],{X,X},Xs).
to_ranges(R,{H,L},[]) -> [{L,H}|R];
to_ranges(R,{H,L},[H|Xs]) -> to_ranges(R,{H,L},Xs);
to_ranges(R,{H,L},[N|Xs]) ->
case N==H-1 of
true -> to_ranges(R,{N,L},Xs);
_ -> to_ranges([{L,H}|R],{N,N},Xs)
end.
% INDEXING - EXTRAS (TBD)
show_index([]) -> ok;
show_index([{Word,Lines}|Xs]) ->
io:format("~s: ~w ~n",[Word,Lines]),
show_index(Xs).
candidate_stop_words(MinLength,Index) ->
candidate_stop_words(MinLength,Index,[]).
candidate_stop_words(_MinLength,[],Stops) -> Stops;
candidate_stop_words(MinLength,[L={Word,Lines}|Xs],Stops) ->
case length(Word)>=MinLength of
true -> candidate_stop_words(MinLength,Xs,[L|Stops]);
_ -> candidate_stop_words(MinLength,Xs,Stops)
end.
% search(Word,Stemming,Index) -> [].
% line_count(Word,Index) -> 0.
% You encounter a school of deadly jellyfish and slowly navigate your ways
% around them...
% INDEXING - HELPERS
% Lines
index_lines(Lines) -> index_lines(1,Lines,[]).
index_lines(_LineNum,[],Index) -> Index;
index_lines(LineNum,[[]|Lines],Index) -> index_lines(LineNum+1,Lines,Index);
index_lines(LineNum,[Line|Lines],Index) ->
index_lines(LineNum+1,Lines,index_words(LineNum,words(Line),Index)).
index_words(_LineNum,[],Index) -> Index;
index_words(LineNum,[Word|Words],Index) ->
index_words(LineNum,Words,upsert(LineNum,Word,Index)).
upsert(LineNum,Word,Index) ->
join_no_reverse(upsert(LineNum,Word,[],Index)).
upsert(LineNum,Word,L0,[]) -> [L0,[{Word, [LineNum]}]];
upsert(LineNum,Word,L0,L1=[{AWord,_Lines}|_Xs]) when AWord>Word ->
[L0,[{Word, [LineNum]}|L1]];
upsert(LineNum,Word,L0,[{Word,Lines}|Xs]) ->
[L0,[{Word,[LineNum|Lines]}|Xs]]; % PRESERVING REPEATS FOR NOW!
upsert(LineNum,Word,L0,[X|Xs]) -> upsert(LineNum,Word,[X|L0],Xs).
% JOIN (one list to another)
join_no_reverse([L0,L1]) -> join_no_reverse(L0,L1).
join_no_reverse([],L1) -> L1;
join_no_reverse([X|Xs],L1) -> join_no_reverse(Xs,[X|L1]).
% Words
words(Line) ->
% valid_words(string:tokens(only_alphas(Line)," "),[]).
S = only_alphas(Line),
% io:format("~p~n",[S]),
Words = string:tokens(S," "),
% io:format("~p~n",[Words]),
valid_words(Words,[]).
valid_words([],Words) -> Words;
valid_words([Word|Xs],Words) ->
case is_valid_word(Word) of
true -> valid_words(Xs,[Word|Words]);
_ -> valid_words(Xs,Words)
end.
is_valid_word(Word) -> not_short_word(Word) andalso not_stop_word(Word).
not_short_word(Word) -> length(Word)>3.
not_stop_word(Word) -> not lists:member(Word,stop_words()).
% Below the jellyfish, you suddenly realise you have took too long and gone
% too deep. Your lungs begin to strain, you are doomed. The golden glint is
% still there so, resigned to your fate, you decide to carry on to see what
% you risked everything for...
% Alpha's & Spaces Filter: exclude all other chars (and extra spaces)
only_alphas(S) -> only_chars(lcase(S),[32|a2z()],[]).
only_chars([],_Chars,Acc) -> lists:reverse(Acc);
% only_Chars([],_Chars,Acc) -> Acc;
only_chars([" "|[" "|Xs]],Chars,Acc) -> only_chars([" "|Xs],Chars,Acc);
only_chars([X|Xs],Chars,Acc) ->
case lists:member(X,Chars) of
true -> only_chars(Xs,Chars,[X|Acc]);
_ -> only_chars(Xs,Chars,Acc)
end.
lcase(S) -> string:to_lower(S). % Cheat!
a2z() -> lists:seq($a,$z).
% Stems
stemmed(Word) -> stemmed(stemmings,[Word],Word).
stemmed([],Stems,_Word) -> Stems;
stemmed([X|Xs],Stems,Word) -> stemmed(Xs,[Word++X|Stems],Word).
% FILE HANDLING
% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
{ok,File} = file:open(Name,[read]),
Rev = get_all_lines(File,[]),
lists:reverse(Rev).
% Auxiliary function for get_file_contents (not exported).
get_all_lines(File,Partial) ->
case io:get_line(File,"") of
eof -> file:close(File),
Partial;
Line -> {Strip,_} = lists:split(length(Line)-1,Line),
get_all_lines(File,[Strip|Partial])
end.
% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([]) ->
ok;
show_file_contents([L|Ls]) ->
io:format("~s~n",[L]),
show_file_contents(Ls).
% TESTS (TBD)
% Reaching the bottom you discover the golden glint is just the sun reflecting
% off an abandoned scuba-diving mask. Luckily it's attached to an air tank with
% just enough air to get you back to the surface. Phew!