#!/bin/bash

#
# xref link analysis
# ==================
#
# Backstory
# ---------
#
# xrefs are the way that our documentation can direct a reader to another part
# of the documentation. They point to a specific refentry found in another file
# in the source tree. By convention, all files have one refentry for the entire
# page, but may also have refentries for specific sections within the page.
#
# References to whole pages make sense when the referenced page is small.
# However, as the referenced page grows in size, not all portions will apply
# for the reference, cluttering the reader experience and making the reference
# less valuable. Additionally, references to a whole page are considered
# more suspicious when more granular refentries exist within that same page.
# 
# Goal
# ----
#
# This script is intended to provide an ongoing way to determine which files
# should be considered for creating more granular refentries. It identifies
# files that have at least one external reference to the first (possibly only)
# refentry, and then ranks those files by a formula that reflects the
# likelihood that the file needs more granular refentries, or (more
# importantly) already has more granular entries and they just aren't being
# used.
#
# Formula
# -------
#
# Files with less than 200 lines are not considered. 
#
# Files are ranked in declining order of N * L * M
#
# N: Number of references to the first refentry in the file.
# L: Number of lines in the file.
# M: Multiplier to raise rank of pages that have multiple refentries but
#    the first refentry is still being used. This is set to 2 if the file has
#    multiple refentries, and 1 if it does not.
#
# Future Directions
# -----------------
#
# The 200-line minimum threshold below is a first guess; we may want to tune
# it or make it configurable. Small pages by definition are fairly focused in
# their topic, and the burden of reading the whole page is small.
#
# After this report has been used a few times, it will become obvious that a
# few stubborn pages continue to rank high despite having been thoroughly
# reviewed. We may want a mechanism to exclude them from the report.
#
# Developer Notes
# ---------------
# 
# The choice of perl regexps in grep is not ideal. However, xsltproc and xq
# were emitting errors about claims of unmatched tags, extra characters
# outside of tags, and various HTMLisms like &mdash; and other &-chars.
#
# The choice of weighting/ranking (number of references * size of file in
# lines) is far from scientific, it was simply the first metric that came
# to mind. The additional multiplier to highlight pages that already have
# multiple refentries was an afterthought, and was added on to avoid
# excluding pages with multiple refentries entirely.
#
# The choice of 200 lines as the minimum threshold for warranting more 
# refentries was based on sampling a few files and seeing how many
# ideas were expressed in that much space.
#
# SQLite may seem like an odd choice given the subject matter, but it is a 
# quick way to correlate throwaway data.
#
# The file ecpg.sgml does not appear to have a top-level refentry which skews
# the rank for what otherwise appears to be a highly granular xref.
#

# -o pipefail matters here: every extraction step below is a pipeline, and
# without it a failing grep/sed stage would be silently masked by the final
# stage's exit status.
set -euo pipefail

#
# location of the PG source tree sgml directory; override by exporting
# PG_DOC_DIR, or change the default to suit your local configuration
#
pg_doc_dir="${PG_DOC_DIR:-${HOME}/src/postgres/doc/src/sgml}"

#
# produce file-line-counts.tsv: one (line_count, file_name) row per sgml
# file, with the source-dir prefix stripped so names match the other extracts
#
wc -l "${pg_doc_dir}"/*.sgml "${pg_doc_dir}"/ref/*.sgml \
    | grep -v 'total$' \
    | sed -e 's/^ *//' -e 's/ /\t/' -e "s#${pg_doc_dir}/##" \
    > file-line-counts.tsv

#
# produce link-counts.tsv: one (link_count, link_name) row per distinct
# linkend target referenced by any <xref .../> in the tree
#
grep -h --only-matching --perl-regexp '(?<=<xref).*?(?=/>)' \
        "${pg_doc_dir}"/*.sgml "${pg_doc_dir}"/ref/*.sgml \
    | grep --only-matching --perl-regexp '(?<=linkend=").*?(?=")' \
    | sort | uniq -c | sort -n -r \
    | sed -e 's/^ *//' -e 's/ /\t/' \
    > link-counts.tsv

#
# produce reference-files.tsv: (line_number, file_name, refentry_id) rows;
# nl's running number lets us recover which refentry appeared first per file
#
grep -o -P '(?<=<refentry ).*?(?=>)' "${pg_doc_dir}"/*.sgml "${pg_doc_dir}"/ref/*.sgml \
    | sed -e "s|^${pg_doc_dir}/||" -e 's/:.*id="/\t/' -e 's/".*//' \
    | nl \
    > reference-files.tsv

#
# combine the extracted data into a weighted report
#
# the heredoc delimiter is quoted so the shell never expands $/backticks
# inside the SQL; sqlite3 with no filename argument uses a throwaway
# in-memory database, which is all we need here
#
sqlite3 << 'EOF'

-- raw imports of the three tsv extracts produced above
CREATE TABLE file_line_counts( line_count integer, file_name text );
CREATE TABLE link_counts( link_count integer, link_name text);
CREATE TABLE reference_files( line_number integer, file_name text, link_name text);

.mode tabs
.import file-line-counts.tsv file_line_counts
.import link-counts.tsv link_counts
.import reference-files.tsv reference_files
.mode column

-- rank each file's refentries in order of appearance (line_number comes
-- from nl, so rank 1 is the first refentry in the file), and count how
-- many refentries the file has in total
WITH reference_files_ranked AS (
    SELECT  rf.file_name, rf.link_name,
            ROW_NUMBER() OVER (PARTITION BY rf.file_name ORDER BY rf.line_number) AS refentry_rank,
            COUNT(*) OVER (PARTITION BY rf.file_name) AS num_refentries
    FROM reference_files AS rf
)
-- report files whose FIRST refentry is externally referenced and that are at
-- least 200 lines long, ranked by N * L * M per the Formula section in the
-- header: N = references to the first refentry, L = lines in the file,
-- M = 2 when the file already has more granular refentries, else 1
SELECT rfr.file_name, rfr.link_name, lc.link_count, flc.line_count, rfr.num_refentries
FROM reference_files_ranked AS rfr
JOIN link_counts AS lc ON lc.link_name = rfr.link_name
JOIN file_line_counts AS flc ON flc.file_name = rfr.file_name
WHERE rfr.refentry_rank = 1
AND flc.line_count >= 200
ORDER BY lc.link_count * flc.line_count * CASE rfr.num_refentries WHEN 1 THEN 1 ELSE 2 END DESC;

EOF
