#!/bin/bash

#
# xref link analysis
# ==================
#
# Backstory
# ---------
#
# xrefs are the way that our documentation can direct a reader to another part
# of the documentation. They point to a specific refentry found in another file
# in the source tree. By convention, all files have one refentry for the entire
# page, but may also have refentries for specific sections within the page.
#
# References to whole pages make sense when the referenced page is small.
# However, as the referenced page grows in size, not all portions will apply
# for the reference, cluttering the reader experience and making the reference
# less valuable. Additionally, references to a whole page are considered
# more suspicious when more granular refentries exist within that same page.
# 
# Goal
# ----
#
# This script is intended to provide an ongoing way to determine which files
# should be considered for creating more granular refentries. It identifies
# files that have at least one external reference to the first (possibly only)
# refentry, and then ranks those files by a formula that reflects the
# likelihood that the file needs more granular refentries, or (more
# importantly) already has more granular entries and they just aren't being
# used.
#
# Formula
# -------
#
# Files with less than 200 lines are not considered. 
#
# Files are ranked in declining order of N * L * M
#
# N: Number of references to the first refentry in the file.
# L: Number of lines in the file.
# M: Multiplier to raise rank of pages that have multiple refentries but
#    the first refentry is still being used. This is set to 2 if the file has
#    multiple refentries, and 1 if it does not.
#
# Future Directions
# -----------------
#
# The 200-line minimum threshold below is a first guess; we may want to tune
# it or make it configurable. Small pages by definition are fairly focused in
# their topic, and the burden of reading the whole page is small.
#
# After this report has been used a few times, it will become obvious that a
# few stubborn pages continue to rank high despite having been thoroughly
# reviewed. We may want a mechanism to exclude them from the report.
#
# Developer Notes
# ---------------
# 
# The choice of perl regexps in grep is not ideal. However, xsltproc and xq
# were emitting errors about claims of unmatched tags, extra characters
# outside of tags, and various HTMLisms like &mdash; and other &-chars.
#
# The choice of weighting/ranking (number of references * size of file in
# lines) is far from scientific, it was simply the first metric that came
# to mind. The additional multiplier to highlight pages that already have
# multiple refentries was an afterthought, and was added on to avoid
# excluding pages with multiple refentries entirely.
#
# The choice of 200 lines as the minimum threshold for warranting more 
# refentries was based on sampling a few files and seeing how many
# ideas were expressed in that much space.
#
# SQLite may seem like an odd choice given the subject matter, but it is a 
# quick way to correlate throwaway data.
#
# The file ecpg.sgml does not appear to have a top-level refentry which skews
# the rank for what otherwise appears to be a highly granular xref.
#

# -o pipefail matters here: every extraction step below is a pipeline, and
# without it a failing grep/sed stage would be silently masked by the final
# stage's exit status.
set -euo pipefail

#
# location of the PG source tree sgml directory; override by exporting
# PG_DOC_DIR, or change the default to suit your local configuration
#
pg_doc_dir="${PG_DOC_DIR:-${HOME}/src/postgres/doc/src/sgml}"

#
# produce file-line-counts.tsv: one (line_count, file_name) row per sgml
# file, with the source-dir prefix stripped so names match the other extracts
#
wc -l "${pg_doc_dir}"/*.sgml "${pg_doc_dir}"/ref/*.sgml \
    | grep -v 'total$' \
    | sed -e 's/^ *//' -e 's/ /\t/' -e "s#${pg_doc_dir}/##" \
    > file-line-counts.tsv

#
# produce link-counts.tsv: one (link_count, link_name) row per distinct
# linkend target referenced by any <xref .../> in the tree
#
grep -h --only-matching --perl-regexp '(?<=<xref).*?(?=/>)' \
        "${pg_doc_dir}"/*.sgml "${pg_doc_dir}"/ref/*.sgml \
    | grep --only-matching --perl-regexp '(?<=linkend=").*?(?=")' \
    | sort | uniq -c | sort -n -r \
    | sed -e 's/^ *//' -e 's/ /\t/' \
    > link-counts.tsv

#
# produce reference-files.tsv: (line_number, file_name, refentry_id) rows;
# nl's running number lets us recover which refentry appeared first per file
#
grep -o -P '(?<=<refentry ).*?(?=>)' "${pg_doc_dir}"/*.sgml "${pg_doc_dir}"/ref/*.sgml \
    | sed -e "s|^${pg_doc_dir}/||" -e 's/:.*id="/\t/' -e 's/".*//' \
    | nl \
    > reference-files.tsv

#
# combine the extracted data into a weighted report
#
# the heredoc delimiter is quoted so the shell never expands $/backticks
# inside the SQL; sqlite3 with no filename argument uses a throwaway
# in-memory database, which is all we need here
#
sqlite3 << 'EOF'

-- raw imports of the three tsv extracts produced above
CREATE TABLE file_line_counts( line_count integer, file_name text );
CREATE TABLE link_counts( link_count integer, link_name text);
CREATE TABLE reference_files( line_number integer, file_name text, link_name text);

.mode tabs
.import file-line-counts.tsv file_line_counts
.import link-counts.tsv link_counts
.import reference-files.tsv reference_files
.mode column

-- rank each file's refentries in order of appearance (line_number comes
-- from nl, so rank 1 is the first refentry in the file), and count how
-- many refentries the file has in total
WITH reference_files_ranked AS (
    SELECT  rf.file_name, rf.link_name,
            ROW_NUMBER() OVER (PARTITION BY rf.file_name ORDER BY rf.line_number) AS refentry_rank,
            COUNT(*) OVER (PARTITION BY rf.file_name) AS num_refentries
    FROM reference_files AS rf
)
-- report files whose FIRST refentry is externally referenced and that are at
-- least 200 lines long, ranked by N * L * M per the Formula section in the
-- header: N = references to the first refentry, L = lines in the file,
-- M = 2 when the file already has more granular refentries, else 1
SELECT rfr.file_name, rfr.link_name, lc.link_count, flc.line_count, rfr.num_refentries
FROM reference_files_ranked AS rfr
JOIN link_counts AS lc ON lc.link_name = rfr.link_name
JOIN file_line_counts AS flc ON flc.file_name = rfr.file_name
WHERE rfr.refentry_rank = 1
AND flc.line_count >= 200
ORDER BY lc.link_count * flc.line_count * CASE rfr.num_refentries WHEN 1 THEN 1 ELSE 2 END DESC;

EOF
