function mg_spellcorrect_generate_hash
compile_opt strictarr, hidden
filename = filepath('big.txt', root=mg_src_root())
nlines = file_lines(filename)
text = strarr(nlines)
openr, lun, filename, /get_lun
readf, lun, text
free_lun, lun
text = strlowcase(text)
text = mg_strmerge(text)
ignore_ind = where(bindgen(256) lt 65 $
or (bindgen(256) gt 90 and bindgen(256) lt 97) $
or bindgen(256) gt 122)
ignore = string(byte(ignore_ind[1:*]))
words = strsplit(text, ignore, /extract)
nwords = n_elements(words)
model = hash()
start = 0L
for w = 0L, nwords - 1L do begin
word = strlowcase(words[w])
if ((w mod 10000) eq 0) then begin
print, 100. * w / nwords, w, nwords, $
format='(%"%0.1f\% completed: %d out of %d done")'
endif
if (model->hasKey(word)) then begin
model[word] = model[word] + 1L
endif else begin
model[word] = 1L
endelse
endfor
return, model
end
function mg_spellcorrect_edits1, word
compile_opt strictarr, hidden
splits = list()
for i = 0L, strlen(word) do begin
splits->add, { first: strmid(word, 0, i), second: strmid(word, i) }
endfor
edits1 = hash()
alphabet = list()
foreach char, bindgen(26) + (byte('a'))[0] do alphabet->add, string(char)
foreach s, splits do begin
edits1[s.first + strmid(s.second, 1)] = 1B
endforeach
foreach s, splits do begin
if (strlen(s.second) gt 1) then begin
edits1[s.first + strmid(s.second, 0, 1) + strmid(s.second, 1, 1) + strmid(s.second, 2)] = 1B
endif
endforeach
foreach s, splits do begin
foreach char, alphabet do begin
edits1[s.first + string(char) + strmid(s.second, 1)] = 1B
endforeach
endforeach
foreach s, splits do begin
foreach char, alphabet do begin
edits1[s.first + string(char) + s.second] = 1B
endforeach
endforeach
return, edits1
end
function mg_spellcorrect_known_edits2, word, known_hash=hash
compile_opt strictarr, hidden
edits2 = hash()
foreach v1, mg_spellcorrect_edits1(word), e1 do begin
foreach v2, mg_spellcorrect_edits1(e1), e2 do begin
if (hash->hasKey(e2)) then edits2[e2] = 1B
endforeach
endforeach
return, edits2
end
function mg_spellcorrect_known, words, known_hash=hash
compile_opt strictarr, hidden
known = hash()
foreach val, words, w do if (hash->hasKey(w)) then known[w] = 1B
return, known
end
function mg_spellcorrect, word, known_words=spell_hash, correct=correct
compile_opt strictarr
spell_hash_filename = filepath('spell_hash.sav', root=mg_src_root())
if (n_elements(spell_hash) eq 0L) then begin
if (file_test(spell_hash_filename)) then begin
restore, filename=spell_hash_filename
endif else begin
spell_hash = mg_spellcorrect_generate_hash()
save, spell_hash, filename=spell_hash_filename
endelse
endif
candidates = mg_spellcorrect_known(hash(word, 1B), known_hash=spell_hash)
if (n_elements(candidates) eq 0L) then begin
correct = 0B
candidates += mg_spellcorrect_known(mg_spellcorrect_edits1(word), known_hash=spell_hash)
endif else begin
correct = 1B
return, word
endelse
if (n_elements(candidates) eq 0L) then begin
candidates += mg_spellcorrect_known_edits2(word, known_hash=spell_hash)
endif
best_candidate = ''
highest_count = 0L
foreach v, candidates, c do begin
if (spell_hash[c] gt highest_count) then begin
best_candidate = c
highest_count = spell_hash[c]
endif
endforeach
return, best_candidate
end
words = ['corect', 'information', 'informtion', 'known']
foreach w, words do begin
corrected = mg_spellcorrect(w, known_words=spell_hash, correct=correct)
if (correct) then begin
print, w, format='(%"\"%s\" was spelled correctly")'
endif else begin
print, w, corrected, format='(%"\"%s\" should be spelled \"%s\"")'
endelse
endforeach
end