From 705128b0f0dbc82db9e7b90aa4103eab9a5ce10e Mon Sep 17 00:00:00 2001 From: Richard Levitte Date: Wed, 2 Oct 2019 13:16:48 +0200 Subject: util/find-doc-nits: more precise option and function name checker The checks for our uses of 'B<' and 'I<' for options, and possibly function names, was over-reaching quite a bit. So we fine-tune it a bit: - by only checking for options in man1 pages, and only in SYNOPSIS and *OPTIONS sections. - by only checking for function names in man3 pages. The man1 option checker has the additional check that options found in *OPTIONS are also found in SYNOPSIS andd vice versa. In all cases, this also handles options and function names with additional markup, such as 'B<-I>' and 'B_push>'. Reviewed-by: Matt Caswell (Merged from https://github.com/openssl/openssl/pull/10073) --- util/find-doc-nits | 171 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 154 insertions(+), 17 deletions(-) diff --git a/util/find-doc-nits b/util/find-doc-nits index 67a2ee365c..9c5826d05f 100755 --- a/util/find-doc-nits +++ b/util/find-doc-nits @@ -18,6 +18,8 @@ use Getopt::Std; use lib catdir(dirname($0), "perl"); use OpenSSL::Util::Pod; +my $debug = 0; # Set to 1 for debug output + # Options. our($opt_d); our($opt_e); @@ -203,6 +205,150 @@ sub check_head_style { } } +# Because we have options and symbols with extra markup, we need +# to take that into account, so we need a regexp that extracts +# markup chunks, including recursive markup. +# please read up on /(?R)/ in perlre(1) +# (note: order is important, (?R) needs to come before .) +# (note: non-greedy is important, or something like 'B and B' +# will be captured as one item) +my $markup_re = + qr/( # Capture group + [BIL]< # The start of what we recurse on + (?:(?-1)|.)*? # recurse the whole regexp (refering to + # the last opened capture group, i.e. the + # start of this regexp), or pick next + # character. Do NOT be greedy! + > # The end of what we recurse on + )/x; # (the x allows this sort of split up regexp) + +# Options must start with a dash, followed by a letter, possibly +# followed by letters, digits, dashes and underscores, and the last +# character must be a letter or a digit. +# We do also accept the single -? or -n, where n is a digit +my $option_re = + qr/(?: + \? # Single question mark + | + \d # Single digit + | + - # Single dash (--) + | + [[:alpha:]](?:[-_[:alnum:]]*?[[:alnum:]])? + )/x; + +# Helper function to check if a given $thing is properly marked up +# option. It returns one of these values: +# +# undef if it's not an option +# "" if it's a malformed option +# $unwrapped the option with the outermost B<> wrapping removed. +sub normalise_option { + my $id = shift; + my $filename = shift; + my $thing = shift; + + my $unwrapped = $thing; + my $unmarked = $thing; + + # $unwrapped is the option with the outer B<> markup removed + $unwrapped =~ s/^B$//; + # $unmarked is the option with *all* markup removed + $unmarked =~ s/[BIL]<|>//msg; + + + # If we found an option, check it, collect it + if ( $unwrapped =~ /^\s*-/ ) { + return $unwrapped # return option with outer B<> removed + if $unmarked =~ /^-${option_re}$/; + return ""; # Malformed option + } + return undef; # Something else +} + +# Checks of command option (man1) formatting. The man1 checks are +# restricted to the SYNOPSIS and OPTIONS sections, the rest is too +# free form, we simply cannot be too strict there. + +sub option_check { + my $id = shift; + my $filename = shift; + my $contents = shift; + + my $synopsis = ($contents =~ /=head1\s+SYNOPSIS(.*?)=head1/s, $1); + + # Some pages have more than one OPTIONS section, let's make sure + # to get them all + my $options = ''; + while ( $contents =~ /=head1\s+[A-Z ]*?OPTIONS$(.*?)(?==head1)/msg ) { + $options .= $1; + } + + # Look for options with no or incorrect markup + while ( $synopsis =~ + /(?[:alnum:]])/msg ) { + err($id, "Malformed option [1] in SYNOPSIS: $&"); + } + + while ( $synopsis =~ /$markup_re/msg ) { + my $found = $&; + print STDERR "$id:DEBUG[option_check] SYNOPSIS: found $found\n" + if $debug; + my $option_uw = normalise_option($id, $filename, $found); + err($id, "Malformed option [2] in SYNOPSIS: $found") + if defined $option_uw && $option_uw eq ''; + } + + # In OPTIONS, we look for =item paragraphs. + # (?=^\s*$) detects an empty line. + while ( $options =~ /=item\s+(.*?)(?=^\s*$)/msg ) { + my $item = $&; + + while ( $item =~ /(\[\s*)?($markup_re)/msg ) { + my $found = $2; + print STDERR "$id:DEBUG[option_check] OPTIONS: found $&\n" + if $debug; + err($id, "Unexpected bracket in OPTIONS =item: $item") + if ($1 // '') ne '' && $found =~ /^B<\s*-/; + + my $option_uw = normalise_option($id, $filename, $found); + err($id, "Malformed option in OPTIONS: $found") + if defined $option_uw && $option_uw eq ''; + } + } +} + +# Normal symbol form +my $symbol_re = qr/[[:alpha:]_][_[:alnum:]]*?/; + +# Checks of function name (man3) formatting. The man3 checks are +# easier than the man1 checks, we only check the names followed by (), +# and only the names that have POD markup. + +sub functionname_check { + my $id = shift; + my $filename = shift; + my $contents = shift; + + while ( $contents =~ /($markup_re)\(\)/msg ) { + print STDERR "$id:DEBUG[functionname_check] SYNOPSIS: found $&\n" + if $debug; + + my $symbol = $1; + my $unmarked = $symbol; + $unmarked =~ s/[BIL]<|>//msg; + + err($id, "Malformed symbol: $symbol") + unless $symbol =~ /^B<.*>$/ && $unmarked =~ /^${symbol_re}$/ + } + + # We can't do the kind of collecting coolness that option_check() + # does, because there are too many things that can't be found in + # name repositories like the NAME sections, such as symbol names + # with a variable part (typically marked up as B_bar> +} + sub check { my $filename = shift; my $dirname = basename(dirname($filename)); @@ -225,9 +371,14 @@ sub check { check_section_location($id, $contents, "EXAMPLES", "SEE ALSO"); } - name_synopsis($id, $filename, $contents) - unless $contents =~ /=for comment generic/ - or $filename =~ m@man[157]/@; + unless ( $contents =~ /=for comment generic/ ) { + if ( $filename =~ m|man3/| ) { + name_synopsis($id, $filename, $contents); + functionname_check($id, $filename, $contents); + } elsif ( $filename =~ m|man1/| ) { + option_check($id, $filename, $contents) + } + } err($id, "doesn't start with =pod") if $contents !~ /^=pod/; @@ -255,20 +406,6 @@ sub check { if $contents =~ /=over([^ ][^24])/; err($id, "Possible version style issue") if $contents =~ /OpenSSL version [019]/; - err($id, "Brackets on item line") - if $contents =~ /=item \[/; - if ( $contents !~ /=for comment generic/) { - # Some API pages have Bbar>. - err($id, "Bad flag formatting inside B<>") - if $contents =~ /B<-[A-Za-z_ ]+ /; - while ( $contents =~ /([BI])<([^>]*)>/g ) { - my $B = $1; - my $T = $2; - next if $T =~ /E - err($id, "Bad content inside $B<$T>") - if $T =~ /[<|]/; - } - } if ( $contents !~ /=for comment multiple includes/ ) { # Look for multiple consecutive openssl #include lines -- cgit v1.2.3