diff --git a/Makefile b/Makefile index 1f0f336..d2ea529 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ EXE = \ check_base/check_base.exe \ + compact/compact.exe \ dag2html/main.exe \ gwFix/gwFixBase.exe \ gwFix/gwFixBurial.exe \ diff --git a/compact/README.MD b/compact/README.MD new file mode 100644 index 0000000..3ea645f --- /dev/null +++ b/compact/README.MD @@ -0,0 +1,22 @@ +# geneweb-compact + +A tool to remove unused values from your database. + +It only support old legacy gwbd format. + +## Why? + +When modifiying a text, deleting a person or a family, GeneWeb does +not actually delete data, it just remove the links to this data, +instead. + +It can be a security problem, if you though you erased sensitive data +from a note which is still present in the database. + +Also, as the time goes by, leftover data can grow waste some space en +resources. + +## What? + +Current stage of the tool only report and dump leftover data. Next +step is to actually perfom a database cleanup. diff --git a/compact/compact.ml b/compact/compact.ml new file mode 100644 index 0000000..2808f26 --- /dev/null +++ b/compact/compact.ml @@ -0,0 +1,177 @@ +open Geneweb +open Def +open Dbdisk + +let fast = ref true +let step_strings = ref false +let step_persons = ref false +let step_families = ref false + +let load_array a = if !fast then a.load_array () +let clear_array a = a.clear_array () + +let split_sname base i = Mutil.split_sname @@ base.data.strings.get i +let split_fname base i = Mutil.split_fname @@ base.data.strings.get i + +let scan base string person family = + let mark t i = try t.(i) <- true with _ -> failwith @@ string_of_int i in + if !step_strings then begin + load_array base.data.strings ; + let rev = Hashtbl.create base.data.strings.len in + for i = 0 to base.data.strings.len - 1 do + Hashtbl.add rev (base.data.strings.get i) i + done ; + let opt_mark t s = match Hashtbl.find_opt rev s with Some x -> mark t x | None -> () in + let t = Array.make base.data.strings.len false in + load_array base.data.persons ; + for i = 0 to base.data.persons.len - 1 do + let p = base.data.persons.get i in + mark t p.first_name ; + List.iter (opt_mark t) (split_fname base p.first_name) ; + mark t p.surname ; + List.iter (opt_mark t) (split_sname base p.surname) ; + opt_mark t @@ Name.concat (base.data.strings.get p.first_name) (base.data.strings.get p.surname) ; + mark t p.image ; + mark t p.public_name ; + List.iter (mark t) p.qualifiers ; + List.iter (mark t) p.aliases ; + List.iter (mark t) p.first_names_aliases ; + List.iter (mark t) p.surnames_aliases ; + List.iter begin fun {t_name;t_ident;t_place; _ } -> + mark t t_ident ; + mark t t_place ; + (match t_name with Tname i -> mark t i | _ -> ()) ; + end p.titles ; + mark t p.occupation ; + mark t p.birth_place ; + mark t p.birth_note ; + mark t p.birth_src ; + mark t p.baptism_place ; + mark t p.baptism_note ; + mark t p.baptism_src ; + mark t p.death_place ; + mark t p.death_note ; + mark t p.death_src ; + mark t p.burial_place ; + mark t p.burial_note ; + mark t p.burial_src ; + mark t p.notes ; + mark t p.psources ; + List.iter (fun {r_sources;_} -> mark t r_sources) p.rparents ; + List.iter begin fun {epers_name;epers_place;epers_reason;epers_note;epers_src;_} -> + (match epers_name with Epers_Name i -> mark t i | _ -> ()) ; + mark t epers_place ; + mark t epers_reason ; + mark t epers_note ; + mark t epers_src ; + end p.pevents + done ; + clear_array base.data.persons ; + load_array base.data.families ; + for i = 0 to base.data.families.len - 1 do + let f = base.data.families.get i in + mark t f.marriage_place ; + mark t f.marriage_note ; + mark t f.marriage_src ; + mark t f.comment ; + mark t f.origin_file ; + mark t f.fsources ; + List.iter begin fun {efam_name;efam_place;efam_reason;efam_note;efam_src;_} -> + (match efam_name with Efam_Name i -> mark t i | _ -> ()) ; + mark t efam_place ; + mark t efam_reason ; + mark t efam_note ; + mark t efam_src ; + end f.fevents + done ; + clear_array base.data.families ; + clear_array base.data.strings ; + string base t + end ; + if !step_persons then begin + load_array base.data.persons ; + let t = Array.make base.data.persons.len false in + for i = 0 to base.data.persons.len - 1 do + if (base.data.persons.get i).key_index <> Gwdb1.dummy_iper then mark t i + done ; + clear_array base.data.persons ; + person base t + end ; + if !step_families then begin + load_array base.data.families ; + let t = Array.make base.data.families.len false in + for i = 0 to base.data.families.len - 1 do + if (base.data.families.get i).fam_index <> Gwdb1.dummy_ifam then mark t i + done ; + clear_array base.data.families ; + family base t + end + +let report base = + let aux fn _base t = + let cnt = ref 0 in + for i = 0 to Array.length t - 1 do if not t.(i) then incr cnt done ; + fn !cnt + in + scan + base + (aux @@ fun c -> Printf.printf "Number of unused strings: %d\n" c) + (aux @@ fun c -> Printf.printf "Number of ghost persons: %d\n" c) + (aux @@ fun c -> Printf.printf "Number of ghost families: %d\n" c) + +let dump base = + let dump_istr base t = + for i = 0 to Array.length t - 1 do + if not t.(i) then begin + Printf.printf "=== [START ISTR %d] ===\n%s\n=== [END ISTR %d] ===\n\n" i (base.data.strings.get i) i + end + done + in + let dump_iper _base t = + for i = 0 to Array.length t - 1 do + if not t.(i) then begin Printf.printf "Ghost person: %d\n" i end + done + in + let dump_ifam _base t = + for i = 0 to Array.length t - 1 do + if not t.(i) then begin Printf.printf "Ghost family: %d\n" i end + done + in + scan base dump_istr dump_iper dump_ifam + +let steps = ref "strings,persons,families" +let bname = ref "" +let action = ref None +let usage = "Usage: " ^ Sys.argv.(0) ^ " [OPTION] ACTIOn base" + +let speclist = + [ ( "-mem" + , Arg.Clear fast + , " slower, but use less memory" ) + ; ( "-step" + , Arg.Set_string steps + , " STEPS steps to perform. Default is " ^ !steps ) + ; ( "-report" + , Arg.Unit (fun () -> action := Some `report) + , " only report number of unused values" ) + ; ( "-dump" + , Arg.Unit (fun () -> action := Some `dump) + , " dump unuse values" ) + ] + +let _ = + Arg.parse speclist (fun s -> bname := s) usage ; + Secure.set_base_dir (Filename.dirname !bname) ; + if !bname = "" then begin Arg.usage speclist usage ; exit 2 end; + List.iter begin function + | "strings" -> step_strings := true + | "persons" -> step_persons := true + | "families" -> step_families := true + | _ -> Arg.usage speclist usage ; exit 2 + end (String.split_on_char ',' !steps) ; + Lock.control (Mutil.lock_file !bname) false ~onerror:Lock.print_try_again @@ fun () -> + let base = Gwdb1.OfGwdb.base (Gwdb.open_base !bname) in + match !action with + | None -> Arg.usage speclist usage ; exit 2 + | Some `dump -> dump base + | Some `report -> report base diff --git a/compact/dune b/compact/dune new file mode 100644 index 0000000..7fbf193 --- /dev/null +++ b/compact/dune @@ -0,0 +1,6 @@ +(executable + (name compact) + (public_name geneweb-compact) + (libraries unix str geneweb.gwdb1 geneweb.wserver geneweb) + (modules compact) +) diff --git a/compact/dune-project b/compact/dune-project new file mode 100644 index 0000000..2e16c24 --- /dev/null +++ b/compact/dune-project @@ -0,0 +1,2 @@ +(lang dune 1.10) +(name geneweb-compact) diff --git a/compact/geneweb-compact.opam b/compact/geneweb-compact.opam new file mode 100644 index 0000000..e69de29