neumachen
3/7/2017 - 12:47 AM

index.erl

-module(index).
-export([index_file/1,show_index/1,get_file_contents/1,show_file_contents/1]).

-include_lib("eunit/include/eunit.hrl").

%%% ASSIGNMENT 2.20 - INDEXING WORDS IN LINES FROM A TEXT FILE

% Example files available in:
%   gettysburg-address.txt (short)
%   dickens-christmas.txt  (long)

% THIS VERSION: "ATTACK OF THE LIST-CLONES!"
% NEXT VERSION: "LIST COMPREHENSIONS (AND FRIENDS) FIGHT BACK!"
%
% You are floating on the sun drench undulating waves of a vast ocean when
% you catch a golden glint from the depths. It's deep, very deep, but you
% are enticed by the thought of riches-beyond-reason and down you dive...

% STRATEGY
%
% Index creation:
%   - For each new word
%     - Find insertion point in Index (before/word-idx/after)
%     - Insert {Word,[LineNum]} into Index
%   - Each index to use linear list of line numbers,
%   - When source read, transform each list into list of {From,To} ranges.
%
% Stop Words:
%   - simple list to start with,
%   - candidates extracted from corpus based on length and/or freq,
%   - file-based, possibly based on std list generally used.
%
% Stemming:
%   - boolean switch for extended search
%   - simple version using word + common endings, eg, "s","er","ing", etc
%
% IMPROVEMENTS:
%   - tests
%   - more comments
%   - dict/map for word-based look-up
%   - list comprehensions (and anything else to shorten this program!)
%   - boolean search terms
%   - better output formatting
%   - include count of repeats per word per line for, eg, word-freq. Or...
%   - preserve file name for, eg, back-referencing, word-freq.


% "CONSTANTS"

% Stop Words
% Since words with 3 or less letters are excluded, only stop words with four or
% more letters are needed. TBD: candidates, file-based.
stop_words() -> ["have","come","that","those","here"]. % sample

% Stemming (TBD)
% stemmings() -> ["s","er","ing"]. % sample


% INDEXING

% Build Index: simply collect indexes, even repeats, eg,
%   { "foo" , [3,4,4,5,7,11,12,13] }

index_file(FileName) ->
  Lines = get_file_contents(FileName),
  condense_index([],index_lines(Lines)).


condense_index(NewIndex,[]) -> lists:reverse(NewIndex);
condense_index(NewIndex,[{Word,Lines}|Xs]) ->
  condense_index([{Word,condense_lines(Lines)}|NewIndex],Xs).

condense_lines([N]) -> [{N,N}];
condense_lines([X|Xs]) -> to_ranges([],{X,X},Xs).

to_ranges(R,{H,L},[]) -> [{L,H}|R];
to_ranges(R,{H,L},[H|Xs]) -> to_ranges(R,{H,L},Xs);
to_ranges(R,{H,L},[N|Xs]) ->
  case N==H-1 of
    true -> to_ranges(R,{N,L},Xs);
    _ -> to_ranges([{L,H}|R],{N,N},Xs)
  end.


% INDEXING - EXTRAS (TBD)

show_index([]) -> ok;
show_index([{Word,Lines}|Xs]) ->
  io:format("~s: ~w ~n",[Word,Lines]),
  show_index(Xs).


candidate_stop_words(MinLength,Index) ->
  candidate_stop_words(MinLength,Index,[]).

candidate_stop_words(_MinLength,[],Stops) -> Stops;
candidate_stop_words(MinLength,[L={Word,Lines}|Xs],Stops) ->
  case length(Word)>=MinLength of
    true -> candidate_stop_words(MinLength,Xs,[L|Stops]);
    _ -> candidate_stop_words(MinLength,Xs,Stops)
  end.


% search(Word,Stemming,Index) -> [].
% line_count(Word,Index) -> 0.


% You encounter a school of deadly jellyfish and slowly navigate your ways
% around them...


% INDEXING - HELPERS

% Lines

index_lines(Lines) -> index_lines(1,Lines,[]).

index_lines(_LineNum,[],Index) -> Index;
index_lines(LineNum,[[]|Lines],Index) -> index_lines(LineNum+1,Lines,Index);
index_lines(LineNum,[Line|Lines],Index) ->
  index_lines(LineNum+1,Lines,index_words(LineNum,words(Line),Index)).

index_words(_LineNum,[],Index) -> Index;
index_words(LineNum,[Word|Words],Index) ->
  index_words(LineNum,Words,upsert(LineNum,Word,Index)).

upsert(LineNum,Word,Index) ->
  join_no_reverse(upsert(LineNum,Word,[],Index)).

upsert(LineNum,Word,L0,[]) -> [L0,[{Word, [LineNum]}]];
upsert(LineNum,Word,L0,L1=[{AWord,_Lines}|_Xs]) when AWord>Word ->
  [L0,[{Word, [LineNum]}|L1]];
upsert(LineNum,Word,L0,[{Word,Lines}|Xs]) ->
  [L0,[{Word,[LineNum|Lines]}|Xs]]; % PRESERVING REPEATS FOR NOW!
upsert(LineNum,Word,L0,[X|Xs]) -> upsert(LineNum,Word,[X|L0],Xs).


% JOIN (one list to another)
join_no_reverse([L0,L1]) -> join_no_reverse(L0,L1).

join_no_reverse([],L1) -> L1;
join_no_reverse([X|Xs],L1) -> join_no_reverse(Xs,[X|L1]).


% Words

words(Line) ->
  % valid_words(string:tokens(only_alphas(Line)," "),[]).
  S = only_alphas(Line),
  % io:format("~p~n",[S]),
  Words = string:tokens(S," "),
  % io:format("~p~n",[Words]),
  valid_words(Words,[]).

valid_words([],Words) -> Words;
valid_words([Word|Xs],Words) ->
  case is_valid_word(Word) of
    true -> valid_words(Xs,[Word|Words]);
    _ -> valid_words(Xs,Words)
  end.

is_valid_word(Word) -> not_short_word(Word) andalso not_stop_word(Word).

not_short_word(Word) -> length(Word)>3.
not_stop_word(Word) -> not lists:member(Word,stop_words()).

% Below the jellyfish, you suddenly realise you have took too long and gone
% too deep. Your lungs begin to strain, you are doomed. The golden glint is
% still there so, resigned to your fate, you decide to carry on to see what
% you risked everything for...

% Alpha's & Spaces Filter: exclude all other chars (and extra spaces)

only_alphas(S) -> only_chars(lcase(S),[32|a2z()],[]).

only_chars([],_Chars,Acc) -> lists:reverse(Acc);
% only_Chars([],_Chars,Acc) -> Acc;
only_chars([" "|[" "|Xs]],Chars,Acc) -> only_chars([" "|Xs],Chars,Acc);
only_chars([X|Xs],Chars,Acc) ->
  case lists:member(X,Chars) of
    true -> only_chars(Xs,Chars,[X|Acc]);
    _ -> only_chars(Xs,Chars,Acc)
  end.

lcase(S) -> string:to_lower(S). % Cheat!

a2z() -> lists:seq($a,$z).

% Stems
stemmed(Word) -> stemmed(stemmings,[Word],Word).
stemmed([],Stems,_Word) -> Stems;
stemmed([X|Xs],Stems,Word) -> stemmed(Xs,[Word++X|Stems],Word).


% FILE HANDLING

% Get the contents of a text file into a list of lines.
% Each line has its trailing newline removed.
get_file_contents(Name) ->
    {ok,File} = file:open(Name,[read]),
    Rev = get_all_lines(File,[]),
    lists:reverse(Rev).

% Auxiliary function for get_file_contents (not exported).
get_all_lines(File,Partial) ->
    case io:get_line(File,"") of
        eof -> file:close(File),
               Partial;
        Line -> {Strip,_} = lists:split(length(Line)-1,Line),
                get_all_lines(File,[Strip|Partial])
    end.

% Show the contents of a list of strings.
% Can be used to check the results of calling get_file_contents.
show_file_contents([]) ->
    ok;
show_file_contents([L|Ls]) ->
    io:format("~s~n",[L]),
    show_file_contents(Ls).


% TESTS (TBD)

% Reaching the bottom you discover the golden glint is just the sun reflecting
% off an abandoned scuba-diving mask. Luckily it's attached to an air tank with
% just enough air to get you back to the surface. Phew!