(* "datai" consists of: *)
(* te_num: number of training examples *)
(* pf_no: hash: pfeature -> summed weight of the feature over the training examples *)
(* cn_no: hash: contrapositive -> number of times in the training examples (tfreq) *)
(* cn_pf_no: hash: contrapositive -> map pfeature -> (summed co-occurrence weight, idf) (sfreq) *)

open Cnf;;
open Fof;;
open Fof_parse;;
open Features;;

(* Command-line switches that tune feature extraction.  Every entry is a
   (flag, action, help text) triple in the shape expected by [Arg.parse]. *)
let speclist =
  let switch name action help = (name, action, help) in
  [
    switch "-feanosubst" (Arg.Clear fea_undersubst)
      "\t\tWhen computing features do not descend in substitution";
    switch "-feanoconst" (Arg.Clear fea_constpred)
      "\t\tDo not include constant features";
    switch "-feasubterm" (Arg.Set fea_subterm)
      "\t\tInclude subterm features";
    switch "-feaweaken" (Arg.Set_float Features.weaken_feature)
      "\t\tFactor to multiply next feature on path (0.0--1.0) default 0.8";
  ];;

(* Process the command line with the switches in [speclist].  Anonymous
   (positional) arguments are accepted and discarded. *)
let () =
  let usage = "Usage: ./hasher  [options]\nAvailable options are:" in
  let discard_anonymous (_ : string) = () in
  Arg.parse speclist discard_anonymous usage;;

(* Training examples produced by [Fof_lexer.data_file] from the file "data";
   the passes below iterate over this list of 7-tuples. *)
let l = Fof_lexer.data_file "data";;
let () = Printf.printf "Read data\n%!";;

(* Load the statistics accumulated by earlier runs from "./datai".

   Result: (te_num, pf_no, cn_no, cn_pf_no), where te_num is an [int ref] and
   the other three are hash tables, read in the order in which they are
   stored in the file.

   - If the file cannot be opened ([Sys_error] from [open_in_bin]), the run
     starts with a zero counter and three empty tables.
   - If the file opens but cannot be read completely, the channel is closed
     and [Failure "Error reading 'datai'"] is raised.  An I/O error in the
     middle of reading is handled this way too instead of being taken for
     "no previous data": the file is rewritten at the end of the run, so
     silently starting from scratch would discard the stored statistics. *)
let te_num, pf_no, cn_no, cn_pf_no =
  (* Marshalled values are binary data, hence the binary-mode open. *)
  let previous = (try Some (open_in_bin "./datai") with Sys_error _ -> None) in
  match previous with
  | None ->
    ref 0, Hashtbl.create 1000, Hashtbl.create 1000, Hashtbl.create 1000
  | Some ic ->
    begin try
      let te_num = ((input_value ic) : int) in
      Printf.printf "Read: %i training examples\n%!" te_num;
      let pf_no = ((input_value ic) : ((int * term list), float) Hashtbl.t) in
      Printf.printf "Read: %i features\n%!" (Hashtbl.length pf_no);
      let cn_no = ((input_value ic) : (int, int) Hashtbl.t) in
      Printf.printf "Read: %i contras\n%!" (Hashtbl.length cn_no);
      let cn_pf_no = ((input_value ic) : (int, (float * float) Fm.t) Hashtbl.t) in
      close_in ic;
      ref te_num, pf_no, cn_no, cn_pf_no
    with _ ->
      (* Release the descriptor before reporting the failure. *)
      close_in_noerr ic;
      failwith "Error reading 'datai'"
    end;;

(* First-pass handler for one training example (a 7-tuple).

   Picks (lit, pat) when [fea_undersubst] is set and (nlit, npat) otherwise,
   increments the example counter [te_num], and adds the weight of every
   feature returned by [path_features] to that feature's running total in
   [pf_no] (a missing entry counts as 0.). *)
let iter_fun1 (_, lit, pat, _, nlit, npat, _) =
  let head, rest =
    match !fea_undersubst with
    | true -> lit, pat
    | false -> nlit, npat
  in
  incr te_num;
  let accumulate fea weight =
    let so_far = try Hashtbl.find pf_no fea with Not_found -> 0. in
    Hashtbl.replace pf_no fea (weight +. so_far)
  in
  Fm.iter accumulate (path_features (head :: rest))
;;
(* Pass 1: run [iter_fun1] over every training example, then report. *)
let () = List.iter iter_fun1 l;;
let () = Printf.printf "Feature frequencies updated\n%!";;

(* Second-pass handler for one training example (a 7-tuple).

   [co] is turned into the key [cn] with
   [md5s (string_of_form (rename_unbound co))].  The function then
   - increments the occurrence count of [cn] in [cn_no] (missing entry = 0);
   - adds the weight of every feature returned by [path_features] to the
     first component stored for that feature under [cn] in [cn_pf_no].
     The second component is reset to 0. here.

   As in [iter_fun1], (lit, pat) is used when [fea_undersubst] is set and
   (nlit, npat) otherwise.  The fourth and seventh tuple components are not
   needed, so they are no longer bound (the original rebinding of [lem] was
   never read). *)
let iter_fun2 (co, lit, pat, _, nlit, npat, _) =
  let lit, pat = if !fea_undersubst then lit, pat else nlit, npat in
  let cn = md5s (string_of_form (rename_unbound co)) in
  Hashtbl.replace cn_no cn (1 + try Hashtbl.find cn_no cn with Not_found -> 0);
  let wfea = path_features (lit :: pat) in
  (* Disabled debug output, kept as found; note that it refers to a module Im
     while the live code uses Fm. *)
  (*Printf.printf "%i " (Im.cardinal wfea); Im.iter (fun f _ -> Printf.printf "%s " (str_of_fea f)) wfea; Printf.printf "\n";*)
  (* Feature map already stored for this key, or an empty one. *)
  let om = try Hashtbl.find cn_pf_no cn with Not_found -> Fm.empty in
  let fold_fun f w sf =
    let ov = try fst (Fm.find f sf) with Not_found -> 0. in
    Fm.add f (ov +. w, 0.) sf
  in
  Hashtbl.replace cn_pf_no cn (Fm.fold fold_fun wfea om)
;;
(* Pass 2: run [iter_fun2] over every training example, then report. *)
let () = List.iter iter_fun2 l;;
let () = Printf.printf "Cn and Pf_Cn frequencies updated\n%!";;

(* [idf f] is log (number of training examples) minus log (total stored for
   feature [f] in [pf_no]).
   Raises [Not_found] when [f] has no entry in [pf_no]. *)
let idf f =
  let examples = float_of_int !te_num in
  let total = Hashtbl.find pf_no f in
  log examples -. log total;;

(* Third-pass handler: for the feature map stored under key [k] in
   [cn_pf_no], keep each first component and overwrite the second one with
   [idf] of the feature, then store the new map back under [k].
   Raises [Not_found] if [k] is absent from [cn_pf_no], or via [idf]. *)
let iter_fun3 k =
  let refresh fea (weight, _) = (weight, idf fea) in
  let refreshed = Fm.mapi refresh (Hashtbl.find cn_pf_no k) in
  Hashtbl.replace cn_pf_no k refreshed;;
(* Pass 3: collect the keys of [cn_pf_no] into a list first, then apply
   [iter_fun3] to each (presumably so the table is not modified while it is
   being traversed, since [iter_fun3] replaces bindings). *)
let keys = Hashtbl.fold (fun key _ acc -> key :: acc) cn_pf_no [];;
let () = List.iter iter_fun3 keys;;

(* Write the updated statistics back to "datai" in the same order in which
   they are read at start-up: example count, pf_no, cn_no, cn_pf_no.

   The channel is opened in binary mode because [output_value] emits
   marshalled (binary) data.  If marshalling fails, the channel is closed
   before the exception is re-raised. *)
let () =
  let oc = open_out_bin "datai" in
  begin try
    output_value oc !te_num;
    output_value oc pf_no;
    output_value oc cn_no;
    output_value oc cn_pf_no
  with e ->
    close_out_noerr oc;
    raise e
  end;
  close_out oc;;
let () = Printf.printf "Wrote datai with %i training examples\n%!" !te_num;;
