#!/bin/bash
# Validate productconfig files from a directory one by one, then stitch them
# together, so the docserv script can build from them.
#
# Arguments (in order of appearance):
# --valid-languages="en-us ..."   # Space-separated list of valid language codes
#                                 # (optional)
# --make-positive                 # Turn translation document blacklist into
#                                 # whitelist (optional)
# INPUT_DIR                       # Directory with config files
# OUTPUT_FILE                     # Output file name (parent directory must exist)
#
# XML tool deps: xmlstarlet, jing, xmllint, xsltproc (only for --make-positive)


# Report a fatal error and stop.
# Arguments: $1 - message; backslash escapes such as \n are expanded.
# Outputs:   the message on stderr.
# Exits the (sub)shell with status 1; never returns.
out() {
  local message=$1
  echo -e "$message" 1>&2
  exit 1
}

# Absolute path of this script with all symlinks resolved, and the directory
# it lives in. realpath already resolves symlinks, so no separate
# "test -L"/readlink step is needed. Everything is quoted so that an
# installation path containing whitespace keeps working.
me=$(realpath "$0")
mydir=$(dirname "$me")

# docserv-dirs sits next to this script. NOTE(review): it presumably provides
# $share_dir and readme_message, which are used further down -- confirm.
source "$mydir/docserv-dirs"

# Print the usage text and exit.
# The text is taken from the comment block at the top of this script: every
# line after the shebang up to and including the first blank line, with the
# leading "# " stripped and a literal "$0" replaced by the script's base name.
# Globals: me (read) - path of this script file.
# Exits with the status of the sed pipeline (0 on success); never returns.
app_help() {
  # "$me" and "$0" are quoted so paths containing whitespace work.
  sed -rn '/#!/{n; p; :loop n; p; /^[ \t]*$/q; b loop}' "$me" \
    | sed -r -e 's/^# ?//' -e "s/\\\$0/$(basename "$0")/"
  exit
}

# Append one entry to the global issue report.
# Arguments: $1 - issue text; "\n" escapes separate lines.
# Globals:   issuelist (appended to). Entries are joined with literal "\n"
#            escapes, which are expanded when the report is finally printed
#            with echo -e.
# Returns:   always 0, so callers can safely chain "add_issue ... && continue".
add_issue() {
  local headline details

  # We apply "fold" only to the first line, under the assumption that it is
  # the prose issue description, while all the rest is assumed to not be
  # foldable (XML errors and such). Continuation lines created by fold are
  # indented to line up under the "- " bullet.
  headline=$(echo -e "$1" | head -1 | fold -s -w 78 | sed '2,${s/^/  /}')
  issuelist+="\n\n- $headline"

  # Any further lines are appended as-is, indented by two spaces.
  if [[ $(echo -e "$1" | wc -l) -gt 1 ]]; then
    details=$(echo -e "$1" | tail -n +2 | sed 's/^/  /')
    issuelist+="\n$details"
  fi

  return 0
}

# Names of the external XML tools; they are looked up via PATH.
xmllint='xmllint'
jing='jing'
starlet='xmlstarlet'

# JVM settings. $stacksize can be preset from the environment; it is not
# referenced again in this script. $java_flags is handed to jing through the
# ADDITIONAL_FLAGS/ADDITIONAL_OPTIONS environment variables and selects the
# Xerces parser configuration named in the value (XIncludeParserConfiguration).
stacksize=${stacksize:-"-Xss4096K"}
java_flags="-Dorg.apache.xerces.xni.parser.XMLParserConfiguration=org.apache.xerces.parsers.XIncludeParserConfiguration"

# Resources shipped with docserv. NOTE(review): $share_dir and readme_message
# are not defined in this file; presumably they come from docserv-dirs.
schema_file=$share_dir/schema/config-validation.rnc
positive_xslt_file=$share_dir/xslt/positive-config.xsl

# Fail early if either resource is missing.
[[ -f "$schema_file" ]] || out "Schema $schema_file does not exist.$(readme_message)"
[[ -f "$positive_xslt_file" ]] || out "Stylesheet $positive_xslt_file does not exist.$(readme_message)"

# Defaults for everything the command line can set.
make_positive=0      # 1 once --make-positive was seen
valid_languages=""   # value of --valid-languages=, stored with a leading space
unknown=""           # arguments that could not be classified ("\n"-terminated)
input_dir=""         # resolved path of the directory holding the config files
output_file=""       # resolved path of the stitched file to write

# Classify every argument:
# - known options set their flag,
# - the first argument that is an existing directory becomes the input dir,
# - the first other argument whose parent directory exists becomes the
#   output file,
# - everything else is collected in $unknown and reported later.
for arg in "$@"; do
  case "$arg" in
    -h|--help)
      app_help
      ;;
    --make-positive)
      make_positive=1
      ;;
    --valid-languages=*)
      # The leading space lets the later format check treat every entry,
      # including the first one, as " <code>".
      valid_languages=" ${arg#*=}"
      ;;
    -?*|'')
      # Unrecognized options (and empty arguments) are never paths. Catching
      # them here keeps them from being handed to readlink/dirname as options.
      unknown+="  $arg\n"
      ;;
    *)
      # Quoted throughout so that paths containing whitespace survive.
      resolved=$(readlink -f -- "$arg")
      if [[ -d "$resolved" && -z "$input_dir" ]]; then
        input_dir=$resolved
      elif [[ -d "$(readlink -f -- "$(dirname -- "$arg")")" && -z "$output_file" ]]; then
        output_file=$resolved
      else
        unknown+="  $arg\n"
      fi
      ;;
  esac
done

# Reject anything the argument loop could not classify.
if [[ -n "$unknown" ]]; then
  out "There are unknown parameters:\n$unknown\n(Note that input and output directories must exist before running this script.)"
fi

# Both positional arguments are mandatory. Without this check an empty
# $input_dir makes the later "cd" go to $HOME, and an empty $output_file only
# fails at the very end when the result is written.
if [[ -z "$input_dir" || -z "$output_file" ]]; then
  out "Both an input directory and an output file must be given.\n(Note that input and output directories must exist before running this script.)"
fi

# The language list must consist solely of " la" or " la-ng" entries. sed
# removes the longest such run from the start of the string; anything left
# over means the list is malformed.
if [[ -n $(echo "$valid_languages" | sed -r 's/( +[a-z]{2}(-[a-z]{2,8})?)*//') ]]; then
  out "Language codes parameter string does not conform to scheme (must be la-ng scheme, with entries space-separated)."
fi


# Work from inside the input directory so the *.xml globs below pick up its
# files. Guard the cd: with an empty $input_dir an unquoted "cd" would silently
# switch to $HOME and validate whatever XML files live there.
if [[ ! -d "$input_dir" ]] || ! cd "$input_dir"; then
  out "Input directory \"$input_dir\" does not exist or cannot be entered."
fi

# "all" is actually only recognized for the extralinks tags. Adding it here is
# a bit dirty, because if we did not also validate against the RELAX NG schema,
# we would be allowing invalid values for lang within builddocs elements.
# Though, as long as we have both, there is no harm done.
# The result is one code per line, sorted and de-duplicated, ready for comm.
valid_languages_sorted=$(printf '%s\n' "all $valid_languages" | tr ' ' '\n' | sort -u)

# Collected validation problems; filled by add_issue.
issuelist=''

# Print every value that occurs more than once in the newline-separated list
# given as $1. Each duplicated value is printed once, all on one line,
# separated by single spaces. No values are printed if nothing is duplicated.
list_duplicates() {
  printf '%s\n' "$1" | sort | uniq -d | paste -sd ' ' -
}

# Validate every *.xml file in the current directory (the input directory).
# Problems are collected with add_issue rather than aborting on the first one,
# so a single run reports everything that needs fixing.
# Globals read: xmllint, jing, starlet, java_flags, schema_file, input_dir,
#               valid_languages, valid_languages_sorted.
for file in *.xml; do

  # xmllint is faster and gives more readable error messages than jing, so
  # run that first. A file that is not well-formed cannot be inspected any
  # further, hence the remaining checks are skipped for it.
  if ! wellformed=$($xmllint --noout --noent "$file" 2>&1); then
    add_issue "$input_dir/$file: File is not well-formed:\n$wellformed"
    continue
  fi

  if ! valid=$(ADDITIONAL_FLAGS="$java_flags" ADDITIONAL_OPTIONS="$java_flags" \
      $jing -ci "$schema_file" "$file" 2>&1); then
    add_issue "$input_dir/$file: File is not valid:\n$valid"
  fi

  # all format tags need at least one format attribute set to "1" or "true";
  # let XPath count the offenders instead of counting ">" characters in the
  # serialized output
  # FIXME: welp! terribly unhelpful error message here
  count_format_issues=$($starlet sel -t -v \
    "count(//format[not(@*='true') and not(@*='1')])" "$file")
  if [[ "$count_format_issues" -gt 0 ]]; then
    add_issue \
      "$input_dir/$file: There is/are $count_format_issues format element(s) where no attribute is set to \"true\" or \"1\"."
  fi

  # we want to allow IDs that start with a digit, hence we can't use RelaxNG's
  # xsd:ID data type, but we can use xmlstarlet to check
  setids=$($starlet sel -t -v "//@setid" "$file")
  duplicates=$(list_duplicates "$setids")
  if [[ -n "$duplicates" ]]; then
    add_issue \
      "$input_dir/$file: Some setid values are not unique. Check for occurrences of the following duplicated setid(s): $duplicates."
  fi

  # make sure each language code appears only once within a given set;
  # docsets are addressed by document order, like the language and
  # deliverable elements below
  docset_count=$($starlet sel -t -v "count(//docset)" "$file")
  for (( docset_index = 1; docset_index <= docset_count; docset_index++ )); do
    langcodes=$($starlet sel -t -v \
      "(//docset)[$docset_index]/builddocs/language/@lang" "$file")
    duplicates=$(list_duplicates "$langcodes")
    if [[ -n "$duplicates" ]]; then
      setid=$($starlet sel -t -v "(//docset)[$docset_index]/@setid" "$file")
      add_issue \
        "$input_dir/$file: Some language elements within a set have non-unique lang attributes. Check for occurrences of the following duplicated lang attribute(s) in docset \"$setid\": $duplicates."
    fi
  done

  # make sure each dc appears only once within a language
  language_count=$($starlet sel -t -v "count(//language)" "$file")
  for (( language_index = 1; language_index <= language_count; language_index++ )); do
    dcs=$($starlet sel -t -v "(//language)[$language_index]//dc" "$file")
    duplicates=$(list_duplicates "$dcs")
    if [[ -n "$duplicates" ]]; then
      # query the file for this very language element, so the reported lang
      # code belongs to the element that actually has the duplicates
      langcode=$($starlet sel -t -v "(//language)[$language_index]/@lang" "$file")
      setid=$($starlet sel -t -v \
        "(//language)[$language_index]/ancestor::docset/@setid" "$file")
      add_issue \
        "$input_dir/$file: Some dc elements within a language have non-unique values. Check for occurrences of the following duplicated dc(s) in docset=$setid/language=$langcode: $duplicates."
    fi
  done

  # make sure each subdeliverable appears only once within a dc
  deliverable_count=$($starlet sel -t -v "count(//deliverable)" "$file")
  for (( deliverable_index = 1; deliverable_index <= deliverable_count; deliverable_index++ )); do
    subdeliverables=$($starlet sel -t -v \
      "(//deliverable)[$deliverable_index]//subdeliverable" "$file")
    duplicates=$(list_duplicates "$subdeliverables")
    if [[ -n "$duplicates" ]]; then
      langcode=$($starlet sel -t -v \
        "(//deliverable)[$deliverable_index]/ancestor::language/@lang" "$file")
      setid=$($starlet sel -t -v \
        "(//deliverable)[$deliverable_index]/ancestor::docset/@setid" "$file")
      dc=$($starlet sel -t -v "(//deliverable)[$deliverable_index]/dc" "$file")
      add_issue \
        "$input_dir/$file: Some subdeliverable elements within a deliverable have non-unique values. Check for occurrences of the following duplicated subdeliverable(s) in docset=$setid/language=$langcode/dc=$dc: $duplicates."
    fi
  done

  # if a list of supported languages was given, every lang attribute in the
  # file must be on it; comm -1 -3 leaves the codes found only in the file
  if [[ -n "$valid_languages" ]]; then
    languages=$($starlet sel -t -v "//@lang" "$file" | sort -u)
    unrecognized_languages=$(comm -1 -3 \
      <(printf '%s\n' "$valid_languages_sorted") \
      <(printf '%s\n' "$languages"))
    if [[ -n "$unrecognized_languages" ]]; then
      add_issue \
        "$input_dir/$file: Some lang attributes are not supported by your configuration INI. Check for occurrences of the following unsupported lang attribute(s):\n$unrecognized_languages"
    fi
  fi

done

# Nothing collected means every file passed; otherwise print the whole report
# via out, which ends the script with a non-zero status.
[[ -z "$issuelist" ]] || out "The following issues occured when validating:$issuelist\n"

# Stitch all validated config files into one <docservconfig> document.
# The document is assembled with real newlines and written with printf '%s',
# so backslashes inside the XML content are passed through untouched (echo -e
# would expand sequences such as "\n" or "\t" found in the content).
outfile=$'<?xml version="1.0" encoding="UTF-8"?>\n<docservconfig>\n\n'

for file in *.xml; do
  # copy the root element of each file, i.e. drop its XML declaration
  outfile+=$($starlet sel -t -c "/*" "$file")
  outfile+=$'\n'
done

outfile+=$'\n</docservconfig>\n'

# check for uniqueness of productid values (which we can't do before
# stitching the file together); uniq -d lists each duplicated value once
duplicate_productids=$(printf '%s\n' "$outfile" \
  | $starlet sel -t -v '//@productid' \
  | sort | uniq -d | paste -sd ' ' -)

if [[ -n "$duplicate_productids" ]]; then
  out "Some productid values in $input_dir are not unique. Check the productid(s): $duplicate_productids."
fi

# return to the directory the script was started from
cd - >/dev/null

# Write the result, optionally passing it through the stylesheet that turns
# the translation document blacklist into a whitelist (--make-positive).
if [[ $make_positive == 1 ]]; then
  printf '%s\n' "$outfile" | xsltproc "$positive_xslt_file" - > "$output_file" \
    || out "Could not transform the configuration with $positive_xslt_file or write $output_file."
else
  printf '%s\n' "$outfile" > "$output_file" \
    || out "Could not write $output_file."
fi
