From de84e9a9f6155ab7e38966aec038cb2b1dd7561c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 12:06:45 -0300 Subject: [PATCH 01/11] XML Entities --- scripts/dtdent-conv.php | 84 +++++++++ scripts/dtdent-split.php | 96 ++++++++++ scripts/entities.php | 390 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 570 insertions(+) create mode 100644 scripts/dtdent-conv.php create mode 100644 scripts/dtdent-split.php create mode 100644 scripts/entities.php diff --git a/scripts/dtdent-conv.php b/scripts/dtdent-conv.php new file mode 100644 index 0000000000..2b1dec2061 --- /dev/null +++ b/scripts/dtdent-conv.php @@ -0,0 +1,84 @@ + | ++----------------------------------------------------------------------+ +| Description: Convert old style .ent into new style .ent XML bundle. | ++----------------------------------------------------------------------+ + +See `entities.php` source for detailed rationale. + +Use this for converting bundled entities files that use into +XML version used by `entities.php`. + +After converting, add the generated entities in an global.ent or +manual.ent file, and delete the previous one. + +After all old style .ent files are split or converted, this script can +be removed. 
*/ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 2 ) + die(" Syntax: php $argv[0] infile\n" ); + +$infile = $argv[1]; + +$content = file_get_contents( $infile ); + +$pos1 = 0; +while ( true ) +{ + $pos1 = strpos( $content , " DOMNodeList (ampunstand intended) + + $name = trim( $name ); + $text = str_replace( "&" , "&" , $text ); + + $frag = "\n"; + $frag .= " $text\n"; + $frag .= ''; + + $dom = new DOMDocument( '1.0' , 'utf8' ); + $dom->recover = true; + $dom->resolveExternals = false; + libxml_use_internal_errors( true ); + + $dom->loadXML( $frag , LIBXML_NSCLEAN ); + $dom->normalizeDocument(); + + libxml_clear_errors(); + + $text = $dom->saveXML( $dom->getElementsByTagName( "entity" )[0] ); + $text = str_replace( "&" , "&" , $text ); + + echo "$text\n"; +} diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php new file mode 100644 index 0000000000..168e5aa890 --- /dev/null +++ b/scripts/dtdent-split.php @@ -0,0 +1,96 @@ + | ++----------------------------------------------------------------------+ +| Description: Split old style .ent file into individual files. | ++----------------------------------------------------------------------+ + +See `entities.php` source for detailed rationale. + +Use this for spliting `language-snippets-ent` or other "big" entities +files into individual .xml files. + +After spliting, add the new directory entities/ with they contents, +and remove `language-snippets-ent`, in one go. + +After all old style .ent files are split or converted, this script can +be removed. */ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +if ( count( $argv ) < 4 ) + die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); + +$infile = $argv[1]; +$outdir = $argv[2]; +$hash = $argv[3] ?? ""; +$user = $argv[4] ?? 
"_"; + +$content = file_get_contents( $infile ); +$entities = []; + +// Parse + +$pos1 = 0; +while ( true ) +{ + $pos1 = strpos( $content , " $text ) +{ + $file = "$outdir/$name.xml"; + if ( file_exists( $file ) ) + exit( "Name colision: $file\n" ); +} + +// Write + +foreach( $entities as $name => $text ) +{ + $file = "$outdir/$name.xml"; + + $header = '' . "\n"; + + if ( $hash != "" ) + $header .= "\n"; + + file_put_contents( $file , $header . $text ); +} + +$total = count( $entities ); +print "Generated $total files.\n"; diff --git a/scripts/entities.php b/scripts/entities.php new file mode 100644 index 0000000000..3305f6a8d6 --- /dev/null +++ b/scripts/entities.php @@ -0,0 +1,390 @@ + | ++----------------------------------------------------------------------+ +| Description: Collect individual entities into an .entities.ent file. | ++----------------------------------------------------------------------+ + +# Mental model, or things that I would liked to know 20 years prior + +XML Entity processing has more in common with DOMDocumentFragment than +DOMElement. In other words, simple text and multi rooted XML files +are valid contents, whereas they are not valid XML documents. + +Also, namespaces do not automatically "cross" between a parent +document and their includes, even if they are included in the same +file, as local textual entities. s are, for all intended +purposes, separated documents, with separated namespaces and have +*expected* different default namespaces. + +So each one of, possibly multiple, "root" XML elements inside an +fragment need to be annotated with default namespace, even if the +"root" element occurs surrounded by text. For example: + +- "texttext", need one namespace, or it is invalid, and; +- "", need TWO namespaces, or it is also invalid. 
+ +# Output + +This script collects bundled and individual entity files (detailed +below), at some expected relative paths, and generates an +.entities.ent file, in a sibling position to manual.xml.in. + +The output .entities.ent file has no duplications, so collection +order is important to keep the necessary operational semantics. Here, +newer loaded entities takes priority (overwrites) over previous one. +Note that this is the reverse of convention, where +duplicated entity names are ignored. The priority order used here +is important to allow detecting cases where "constant" entities +are being overwriten, or if translatable entities are missing +translations. + +# Individual tracked entities, or `.xml` files at `entities/` + +As explained above, the individual entity contents are not really +valid XML *documents*, they are only at most valid XML *fragments*. + +Yet, individual entities are stored in entities/ as .xml files, for +two reasons: first, text editors in general can highlights XML syntax, +and second, this allows normal revision tracking on then, without +requiring weird changes on `revcheck.php`. + +# Bundled entities files, group tracked + +For very small textual entities, down to simple text words or single +tag elements, that may never change, individual entity tracking is +an overkill. This script also loads bundled entities files, at +some expected locations, with specific semantics. + +These bundle files are really normal XML files, correctly annotated +with XML namespaces used on manual, so any individual exported entity +have corret XML namespace annotations. These bundle entity files +are revcheck tracked normaly, but are not included in manual.xml.in, +as they only participate in general entity loading, described above. 
+ +- global.ent - expected untranslated +- manual.ent - expected translated +- lang/entities/* - expected translated + +*/ + +ini_set( 'display_errors' , 1 ); +ini_set( 'display_startup_errors' , 1 ); +error_reporting( E_ALL ); + +const PARTIAL_IMPL = true; // For while spliting and bundle convertion are incomplete + +if ( count( $argv ) < 2 || in_array( '--help' , $argv ) || in_array( '-h' , $argv ) ) +{ + fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] entitiesDir [entitiesDir]\n\n" ); + return; +} + +$filename = Entities::rotateOutputFile(); + +$langs = []; +$normal = true; // configure.php mode +$debug = false; // detailed output + +for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) + if ( $argv[$idx] == "--debug" ) + { + $normal = false; + $debug = true; + } + else + $langs[] = $argv[$idx]; + +if ( $normal ) + print "Creating .entities.ent..."; +else + print "Creating .entities.ent in debug mode.\n"; + +loadEnt( __DIR__ . "/../global.ent" , global: true , warnMissing: true ); +foreach( $langs as $lang ) +{ + loadEnt( __DIR__ . "/../../$lang/global.ent" , global: true ); + loadEnt( __DIR__ . "/../../$lang/manual.ent" , translate: true , warnMissing: true ); + loadEnt( __DIR__ . 
"/../../$lang/remove.ent" , remove: true ); + loadDir( $langs , $lang ); +} + +Entities::writeOutputFile(); +Entities::checkReplaces( $debug ); + +echo " done: " , Entities::$countTotalGenerated , " entities"; +if ( Entities::$countUnstranslated > 0 ) + echo ", " , Entities::$countUnstranslated , " untranslated"; +if ( Entities::$countConstantReplaced > 0 ) + echo ", " , Entities::$countConstantReplaced , " global replaced"; +if ( Entities::$countRemoveReplaced > 0 ) + echo ", " , Entities::$countRemoveReplaced , " to be removed"; +echo ".\n"; + +exit; + +class EntityData +{ + public function __construct( + public string $path , + public string $name , + public string $text ) {} +} + +class Entities +{ + public static int $countConstantReplaced = 0; + public static int $countUnstranslated = 0; + public static int $countRemoveReplaced = 0; + public static int $countTotalGenerated = 0; + + private static string $filename = __DIR__ . "/../.entities.ent"; // sibling of .manual.xml + + private static array $entities = []; // All entities, overwriten + private static array $global = []; // Entities from global.ent files + private static array $replace = []; // Entities expected replaced / translated + private static array $remove = []; // Entities expected removed + private static array $count = []; // Name / Count + private static array $slow = []; // External entities, slowless, overwrite + + static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) + { + $entity = new EntityData( $path , $name , $text ); + Entities::$entities[ $name ] = $entity; + + if ( $global ) + Entities::$global[ $name ] = $name; + + if ( $replace ) + Entities::$replace[ $name ] = $name; + + if ( $remove ) + Entities::$remove[ $name ] = $name; + + if ( ! 
isset( Entities::$count[$name] ) ) + Entities::$count[$name] = 1; + else + Entities::$count[$name]++; + } + + static function slow( string $path ) + { + if ( isset( $slow[$path] ) ) + fwrite( STDERR , "Unexpected physical file ovewrite: $path\n" ); + $slow[ $path ] = $path; + } + + static function rotateOutputFile() + { + if ( file_exists( Entities::$filename ) ) + unlink( Entities::$filename ); + touch( Entities::$filename ); + + Entities::$filename = realpath( Entities::$filename ); // only full paths on XML + } + + static function writeOutputFile() + { + saveEntitiesFile( Entities::$filename , Entities::$entities ); + } + + static function checkReplaces( bool $debug ) + { + Entities::$countTotalGenerated = count( Entities::$entities ); + Entities::$countConstantReplaced = 0; + Entities::$countUnstranslated = 0; + Entities::$countRemoveReplaced = 0; + + foreach( Entities::$entities as $name => $text ) + { + $replaced = Entities::$count[$name] - 1; + $expectedConstant = in_array( $name , Entities::$global ); + $expectedReplaced = in_array( $name , Entities::$replace ); + $expectedRemoved = in_array( $name , Entities::$remove ); + + if ( $expectedConstant && $replaced != 0 ) + { + Entities::$countConstantReplaced++; + if ( $debug ) + print "Expected global, replaced $replaced times:\t$name\n"; + } + + if ( $expectedReplaced && $replaced != 1 ) + { + Entities::$countUnstranslated++; + if ( $debug ) + print "Expected translated, replaced $replaced times:\t$name\n"; + } + + if ( $expectedRemoved && $replaced != 0 ) + { + Entities::$countRemoveReplaced++; + if ( $debug ) + print "Expected removed, replaced $replaced times:\t$name\n"; + } + } + } +} + +function loadEnt( string $path , bool $global = false , bool $translate = false , bool $remove = false , bool $warnMissing = false ) +{ + $absolute = realpath( $path ); + if ( $absolute === false ) + if ( PARTIAL_IMPL ) + return; + else + if ( $warnMissing ) + fwrite( STDERR , "\n Missing entity file: $path\n" ); + $path 
= $absolute; + + $text = file_get_contents( $path ); + $text = str_replace( "&" , "&" , $text ); + + $dom = new DOMDocument( '1.0' , 'utf8' ); + if ( ! $dom->loadXML( $text ) ) + die( "XML load failed for $path\n" ); + + $xpath = new DOMXPath( $dom ); + $list = $xpath->query( "/*/*" ); + + foreach( $list as $ent ) + { + // weird, namespace correting, DOMNodeList -> DOMDocumentFragment + $other = new DOMDocument( '1.0' , 'utf8' ); + + foreach( $ent->childNodes as $node ) + $other->appendChild( $other->importNode( $node , true ) ); + + $name = $ent->getAttribute( "name" ); + $text = $other->saveXML(); + + $text = str_replace( "&" , "&" , $text ); + $text = rtrim( $text , "\n" ); + $lines = explode( "\n" , $text ); + array_shift( $lines ); // remove XML declaration + $text = implode( "\n" , $lines ); + + Entities::put( $path , $name , $text , $global , $translate , $remove ); + } +} + +function loadDir( array $langs , string $lang ) +{ + global $debug; + + $dir = __DIR__ . "/../../$lang/entities"; + $dir = realpath( $dir ); + if ( $dir === false || ! is_dir( $dir ) ) + if ( PARTIAL_IMPL ) + { + if ( $debug ) + print "Not a directory: $dir\n"; + return; + } + else + exit( "Not directory: $dir\n" ); + + $files = scandir( $dir ); + $expectedReplaced = array_search( $lang , $langs ) > 0; + + foreach( $files as $file ) + { + $path = realpath( "$dir/$file" ); + + if ( is_dir( $path ) ) + continue; + if ( str_starts_with( $file , '.' 
) ) + continue; + + $text = file_get_contents( $path ); + $text = rtrim( $text , "\n" ); + + loadXml( $path , $text , $expectedReplaced ); + } +} + +function loadXml( string $path , string $text , bool $expectedReplaced ) +{ + if ( trim( $text ) == "" ) + { + fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); + Entities::put( $pat , $text , remove: true ); + return; + } + + $info = pathinfo( $path ); + $name = $info["filename"]; + + $frag = "$text"; + + $dom = new DOMDocument( '1.0' , 'utf8' ); + $dom->recover = true; + $dom->resolveExternals = false; + libxml_use_internal_errors( true ); + + $res = $dom->loadXML( $frag ); + + $err = libxml_get_errors(); + libxml_clear_errors(); + + foreach( $err as $item ) + { + $msg = trim( $item->message ); + if ( str_starts_with( $msg , "Entity '" ) && str_ends_with( $msg , "' not defined" ) ) + continue; + + fwrite( STDERR , "\n XML load failed on entity file." ); + fwrite( STDERR , "\n Path: $path" ); + fwrite( STDERR , "\n Error: $msg\n" ); + return; + } + + Entities::put( $path , $name , $text , replace: $expectedReplaced ); +} + +function saveEntitiesFile( string $filename , array $entities ) +{ + $tmpDir = __DIR__ . "/entities"; + + $file = fopen( $filename , "w" ); + fputs( $file , "\n\n\n" ); + + foreach( $entities as $name => $entity ) + { + $text = $entity->text; + $quote = ""; + + // If the text contains mixed quoting, keeping it + // as an external file to avoid (re)quotation hell. + + if ( strpos( $text , "'" ) === false ) + $quote = "'"; + if ( strpos( $text , '"' ) === false ) + $quote = '"'; + + if ( $quote == "" ) + { + if ( $entity->path == "" ) + { + $entity->path = $tmpDir . 
"/{$entity->path}.tmp"; + file_put_contents( $entity->path , $text ); + } + fputs( $file , "path}'>\n\n" ); + Entities::slow( $entity->path ); + } + else + fputs( $file , "\n\n" ); + } + + fclose( $file ); +} From fc3772cefdfed569d3f3f738a8281d4c10fc432f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 13:19:06 -0300 Subject: [PATCH 02/11] Fixes on conv/split tools --- scripts/dtdent-conv.php | 8 +++---- scripts/dtdent-split.php | 50 ++++++++++++++++++++++++++++++---------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/scripts/dtdent-conv.php b/scripts/dtdent-conv.php index 2b1dec2061..777a2cb2d8 100644 --- a/scripts/dtdent-conv.php +++ b/scripts/dtdent-conv.php @@ -12,10 +12,10 @@ +----------------------------------------------------------------------+ | Authors: André L F S Bacci | +----------------------------------------------------------------------+ -| Description: Convert old style .ent into new style .ent XML bundle. | +| Description: Convert DTD Entities files into XML Entities files. | +----------------------------------------------------------------------+ -See `entities.php` source for detailed rationale. +See `entities.php` for detailed rationale. Use this for converting bundled entities files that use into XML version used by `entities.php`. 
@@ -58,7 +58,7 @@ $name = substr( $content , $pos1 , $pos2 - $pos1 - 1 ); $text = substr( $content , $pos2 , $pos3 - $pos2 ); - // weird &ugly; ass, namespace corret, DOMDocumentFragment -> DOMNodeList (ampunstand intended) + // weird &ugly; ass, namespace correct, DOMDocumentFragment -> DOMNodeList (ampunstand intended) $name = trim( $name ); $text = str_replace( "&" , "&" , $text ); @@ -80,5 +80,5 @@ $text = $dom->saveXML( $dom->getElementsByTagName( "entity" )[0] ); $text = str_replace( "&" , "&" , $text ); - echo "$text\n"; + echo "\n$text\n"; } diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php index 168e5aa890..d5d684c446 100644 --- a/scripts/dtdent-split.php +++ b/scripts/dtdent-split.php @@ -12,26 +12,26 @@ +----------------------------------------------------------------------+ | Authors: André L F S Bacci | +----------------------------------------------------------------------+ -| Description: Split old style .ent file into individual files. | +| Description: Split old DTD .ent file into individual XML files. | +----------------------------------------------------------------------+ -See `entities.php` source for detailed rationale. +See `entities.php` for detailed rationale. -Use this for spliting `language-snippets-ent` or other "big" entities -files into individual .xml files. +Use this for spliting `language-snippets-ent` and possible other DTD +entities files into individual .xml files. -After spliting, add the new directory entities/ with they contents, -and remove `language-snippets-ent`, in one go. +After spliting, add generated files under doc-lang/entities/ , and +the original file, in one go. -After all old style .ent files are split or converted, this script can +After all DTD .ent files are split or converted, this script can be removed. 
*/ ini_set( 'display_errors' , 1 ); ini_set( 'display_startup_errors' , 1 ); error_reporting( E_ALL ); -if ( count( $argv ) < 4 ) - die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); +if ( count( $argv ) < 3 ) + die(" Syntax: php $argv[0] infile outdir [hash user]\n" ); $infile = $argv[1]; $outdir = $argv[2]; @@ -75,7 +75,7 @@ { $file = "$outdir/$name.xml"; if ( file_exists( $file ) ) - exit( "Name colision: $file\n" ); + echo( "Entity name colision, OVERWROTE: $file\n" ); } // Write @@ -83,8 +83,7 @@ foreach( $entities as $name => $text ) { $file = "$outdir/$name.xml"; - - $header = '' . "\n"; + $header = ""; if ( $hash != "" ) $header .= "\n"; @@ -92,5 +91,32 @@ file_put_contents( $file , $header . $text ); } +// Test + +$dom = new DOMDocument(); +$dom->recover = true; +$dom->resolveExternals = false; +libxml_use_internal_errors( true ); + +foreach( $entities as $name => $text ) +{ + $file = "$outdir/$name.xml"; + + $text = file_get_contents( $file ); + $text = "$text"; + + $dom->loadXML( $text ); + $err = libxml_get_errors(); + libxml_clear_errors(); + + foreach( $err as $e ) + { + $msg = trim( $e->message ); + if ( str_starts_with( $msg , "Entity '" ) && str_ends_with( $msg , "' not defined" ) ) + continue; + die( "Failed to load $file\n" ); + } +} + $total = count( $entities ); print "Generated $total files.\n"; From b31590c79c757421c17ddb593a91ab82382c566b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 14:35:12 -0300 Subject: [PATCH 03/11] Example (dist) files, idempotent build and other fixes --- entities/global.ent-dist | 19 +++++++ entities/manual.ent-dist | 14 +++++ entities/remove.ent-dist | 20 +++++++ scripts/entities.php | 116 ++++++++++++++++++++------------------- 4 files changed, 112 insertions(+), 57 deletions(-) create mode 100644 entities/global.ent-dist create mode 100644 entities/manual.ent-dist create mode 100644 entities/remove.ent-dist diff --git a/entities/global.ent-dist 
b/entities/global.ent-dist new file mode 100644 index 0000000000..1d5c90fe82 --- /dev/null +++ b/entities/global.ent-dist @@ -0,0 +1,19 @@ + + + + + + + \ No newline at end of file diff --git a/entities/manual.ent-dist b/entities/manual.ent-dist new file mode 100644 index 0000000000..62ca585503 --- /dev/null +++ b/entities/manual.ent-dist @@ -0,0 +1,14 @@ + + + + + + + + \ No newline at end of file diff --git a/entities/remove.ent-dist b/entities/remove.ent-dist new file mode 100644 index 0000000000..6bf8988ad5 --- /dev/null +++ b/entities/remove.ent-dist @@ -0,0 +1,20 @@ + + + + + + + \ No newline at end of file diff --git a/scripts/entities.php b/scripts/entities.php index 3305f6a8d6..de090dcae7 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -17,12 +17,12 @@ # Mental model, or things that I would liked to know 20 years prior -XML Entity processing has more in common with DOMDocumentFragment than +DTD Entity processing has more in common with DOMDocumentFragment than DOMElement. In other words, simple text and multi rooted XML files are valid contents, whereas they are not valid XML documents. Also, namespaces do not automatically "cross" between a parent -document and their includes, even if they are included in the same +document and their entities, even if they are included in the same file, as local textual entities. s are, for all intended purposes, separated documents, with separated namespaces and have *expected* different default namespaces. @@ -36,11 +36,11 @@ # Output -This script collects bundled and individual entity files (detailed +This script collects grouped and individual entity files (detailed below), at some expected relative paths, and generates an .entities.ent file, in a sibling position to manual.xml.in. -The output .entities.ent file has no duplications, so collection +The output file .entities.ent has no duplications, so collection order is important to keep the necessary operational semantics. 
Here, newer loaded entities takes priority (overwrites) over previous one. Note that this is the reverse of convention, where @@ -49,52 +49,56 @@ are being overwriten, or if translatable entities are missing translations. -# Individual tracked entities, or `.xml` files at `entities/` +# Individual XML Entities, or `.xml` files at `entities/` As explained above, the individual entity contents are not really valid XML *documents*, they are only at most valid XML *fragments*. Yet, individual entities are stored in entities/ as .xml files, for two reasons: first, text editors in general can highlights XML syntax, -and second, this allows normal revision tracking on then, without -requiring weird changes on `revcheck.php`. +even for XML fragments, and second, this allows normal revision tracking +per file, without requiring weird changes on `revcheck.php`. Note that +is *invalid* to place XML declaration in these fragment files, at least +in files that are invalid XML documents (on multi node rooted ones). -# Bundled entities files, group tracked +# Grouped entities files, file tracked For very small textual entities, down to simple text words or single tag elements, that may never change, individual entity tracking is -an overkill. This script also loads bundled entities files, at +an overkill. This script also loads grouped XML Entities files, at some expected locations, with specific semantics. -These bundle files are really normal XML files, correctly annotated +These grouped files are really normal XML files, correctly annotated with XML namespaces used on manual, so any individual exported entity -have corret XML namespace annotations. These bundle entity files -are revcheck tracked normaly, but are not included in manual.xml.in, -as they only participate in general entity loading, described above. +have correct anc clean XML namespace annotations. 
These grouped entity +files are tracked normally by revcheck, but are not directly included +in manual.xml.in, as they only participate in general entity loading, +described above. -- global.ent - expected untranslated -- manual.ent - expected translated -- lang/entities/* - expected translated +- global.ent - expected unreplaced +- manual.ent - expected replaced (translated) +- remove.ent - expected unused +- lang/entities/* - expected replaced (translated) */ +const PARTIAL_IMPL = true; // For while XML Entities are not fully implanted in all languages + ini_set( 'display_errors' , 1 ); ini_set( 'display_startup_errors' , 1 ); error_reporting( E_ALL ); -const PARTIAL_IMPL = true; // For while spliting and bundle convertion are incomplete - if ( count( $argv ) < 2 || in_array( '--help' , $argv ) || in_array( '-h' , $argv ) ) { - fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] entitiesDir [entitiesDir]\n\n" ); + fwrite( STDERR , "\nUsage: {$argv[0]} [--debug] langCode [langCode]\n\n" ); return; } -$filename = Entities::rotateOutputFile(); +$filename = Entities::rotateOutputFile(); // idempotent $langs = []; -$normal = true; // configure.php mode -$debug = false; // detailed output +$normal = true; // Normal configure.php mode +$debug = false; // Detailed console mode for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) if ( $argv[$idx] == "--debug" ) @@ -125,10 +129,10 @@ echo " done: " , Entities::$countTotalGenerated , " entities"; if ( Entities::$countUnstranslated > 0 ) echo ", " , Entities::$countUnstranslated , " untranslated"; -if ( Entities::$countConstantReplaced > 0 ) - echo ", " , Entities::$countConstantReplaced , " global replaced"; -if ( Entities::$countRemoveReplaced > 0 ) - echo ", " , Entities::$countRemoveReplaced , " to be removed"; +if ( Entities::$countReplacedGlobal > 0 ) + echo ", " , Entities::$countReplacedGlobal , " global replaced"; +if ( Entities::$countReplacedRemove > 0 ) + echo ", " , Entities::$countReplacedRemove , " remove replaced"; 
echo ".\n"; exit; @@ -143,19 +147,19 @@ public function __construct( class Entities { - public static int $countConstantReplaced = 0; public static int $countUnstranslated = 0; - public static int $countRemoveReplaced = 0; + public static int $countReplacedGlobal = 0; + public static int $countReplacedRemove = 0; public static int $countTotalGenerated = 0; - private static string $filename = __DIR__ . "/../.entities.ent"; // sibling of .manual.xml + private static string $filename = __DIR__ . "/../temp/entities.ent"; // idempotent private static array $entities = []; // All entities, overwriten - private static array $global = []; // Entities from global.ent files + private static array $global = []; // Entities expected not replaced private static array $replace = []; // Entities expected replaced / translated - private static array $remove = []; // Entities expected removed + private static array $remove = []; // Entities expected not replaced and not used private static array $count = []; // Name / Count - private static array $slow = []; // External entities, slowless, overwrite + private static array $slow = []; // External entities, slow, uncontroled overwrite static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) { @@ -189,7 +193,6 @@ static function rotateOutputFile() if ( file_exists( Entities::$filename ) ) unlink( Entities::$filename ); touch( Entities::$filename ); - Entities::$filename = realpath( Entities::$filename ); // only full paths on XML } @@ -201,36 +204,36 @@ static function writeOutputFile() static function checkReplaces( bool $debug ) { Entities::$countTotalGenerated = count( Entities::$entities ); - Entities::$countConstantReplaced = 0; Entities::$countUnstranslated = 0; - Entities::$countRemoveReplaced = 0; + Entities::$countReplacedGlobal = 0; + Entities::$countReplacedRemove = 0; foreach( Entities::$entities as $name => $text ) { $replaced = 
Entities::$count[$name] - 1; - $expectedConstant = in_array( $name , Entities::$global ); + $expectedGlobal = in_array( $name , Entities::$global ); $expectedReplaced = in_array( $name , Entities::$replace ); $expectedRemoved = in_array( $name , Entities::$remove ); - if ( $expectedConstant && $replaced != 0 ) + if ( $expectedGlobal && $replaced != 0 ) { - Entities::$countConstantReplaced++; + Entities::$countReplacedGlobal++; if ( $debug ) - print "Expected global, replaced $replaced times:\t$name\n"; + print "Expected global, replaced $replaced times: $name\n"; } if ( $expectedReplaced && $replaced != 1 ) { Entities::$countUnstranslated++; if ( $debug ) - print "Expected translated, replaced $replaced times:\t$name\n"; + print "Expected translated, replaced $replaced times: $name\n"; } if ( $expectedRemoved && $replaced != 0 ) { - Entities::$countRemoveReplaced++; + Entities::$countReplacedRemove++; if ( $debug ) - print "Expected removed, replaced $replaced times:\t$name\n"; + print "Expected removed, replaced $replaced times: $name\n"; } } } @@ -238,14 +241,14 @@ static function checkReplaces( bool $debug ) function loadEnt( string $path , bool $global = false , bool $translate = false , bool $remove = false , bool $warnMissing = false ) { - $absolute = realpath( $path ); - if ( $absolute === false ) + $realpath = realpath( $path ); + if ( $realpath === false ) if ( PARTIAL_IMPL ) return; else if ( $warnMissing ) fwrite( STDERR , "\n Missing entity file: $path\n" ); - $path = $absolute; + $path = $realpath; $text = file_get_contents( $path ); $text = str_replace( "&" , "&" , $text ); @@ -259,7 +262,7 @@ function loadEnt( string $path , bool $global = false , bool $translate = false foreach( $list as $ent ) { - // weird, namespace correting, DOMNodeList -> DOMDocumentFragment + // weird, namespace correting, DOMNodeList -> DOMDocumentFragment transform $other = new DOMDocument( '1.0' , 'utf8' ); foreach( $ent->childNodes as $node ) @@ -268,8 +271,8 @@ function 
loadEnt( string $path , bool $global = false , bool $translate = false $name = $ent->getAttribute( "name" ); $text = $other->saveXML(); - $text = str_replace( "&" , "&" , $text ); $text = rtrim( $text , "\n" ); + $text = str_replace( "&" , "&" , $text ); $lines = explode( "\n" , $text ); array_shift( $lines ); // remove XML declaration $text = implode( "\n" , $lines ); @@ -292,7 +295,7 @@ function loadDir( array $langs , string $lang ) return; } else - exit( "Not directory: $dir\n" ); + exit( "Error: not a directory: $dir\n" ); $files = scandir( $dir ); $expectedReplaced = array_search( $lang , $langs ) > 0; @@ -301,10 +304,10 @@ function loadDir( array $langs , string $lang ) { $path = realpath( "$dir/$file" ); - if ( is_dir( $path ) ) - continue; if ( str_starts_with( $file , '.' ) ) continue; + if ( is_dir( $path ) ) + continue; $text = file_get_contents( $path ); $text = rtrim( $text , "\n" ); @@ -315,18 +318,17 @@ function loadDir( array $langs , string $lang ) function loadXml( string $path , string $text , bool $expectedReplaced ) { + $info = pathinfo( $path ); + $name = $info["filename"]; + $frag = "$text"; + if ( trim( $text ) == "" ) { fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); - Entities::put( $pat , $text , remove: true ); + Entities::put( $path , $name , $text ); return; } - $info = pathinfo( $path ); - $name = $info["filename"]; - - $frag = "$text"; - $dom = new DOMDocument( '1.0' , 'utf8' ); $dom->recover = true; $dom->resolveExternals = false; @@ -354,7 +356,7 @@ function loadXml( string $path , string $text , bool $expectedReplaced ) function saveEntitiesFile( string $filename , array $entities ) { - $tmpDir = __DIR__ . "/entities"; + $tmpDir = __DIR__ . 
"/temp"; // idempotent $file = fopen( $filename , "w" ); fputs( $file , "\n\n\n" ); From e4f12f53f47991dbfd32c1635469c5d187b10443 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Fri, 6 Dec 2024 15:24:10 -0300 Subject: [PATCH 04/11] Idempotence and opt-in / partial implantation --- scripts/entities.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/entities.php b/scripts/entities.php index de090dcae7..08544cbd6f 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -324,7 +324,8 @@ function loadXml( string $path , string $text , bool $expectedReplaced ) if ( trim( $text ) == "" ) { - fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); + if ( ! PARTIAL_IMPL ) + fwrite( STDERR , "\n Empty entity (should it be in remove.ent?): '$path' \n" ); Entities::put( $path , $name , $text ); return; } From 4ec34eaf3835acabf348d3edaac67d7fc7db2493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 09:01:18 -0300 Subject: [PATCH 05/11] Improve comment texts --- scripts/entities.php | 49 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scripts/entities.php b/scripts/entities.php index 08544cbd6f..d9402bb666 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -36,41 +36,42 @@ # Output -This script collects grouped and individual entity files (detailed -below), at some expected relative paths, and generates an -.entities.ent file, in a sibling position to manual.xml.in. - -The output file .entities.ent has no duplications, so collection -order is important to keep the necessary operational semantics. Here, -newer loaded entities takes priority (overwrites) over previous one. -Note that this is the reverse of convention, where -duplicated entity names are ignored. 
The priority order used here -is important to allow detecting cases where "constant" entities -are being overwriten, or if translatable entities are missing -translations. +This script collects grouped and individual XML Entity files +(detailed below), at some expected relative paths, and generates a +doc-base/temp/entities.ent file with their respective DTD Entities. + +The output file has no duplications, so collection order is important +to keep the necessary operational semantics. Here, later loaded entities +take priority over (override) previously defined ones. Note that this is the +reverse of DTD convention, where duplicated entity names are +ignored. The priority order used here is important to allow detecting +cases where global entities are being overwritten, or if expected +translatable entities are missing translations. # Individual XML Entities, or `.xml` files at `entities/` As explained above, the individual entity contents are not really valid XML *documents*, they are only at most valid XML *fragments*. +More technically, these XML files are really well-balanced texts, per +https://www.w3.org/TR/xml-fragment/#defn-well-balanced . Yet, individual entities are stored in entities/ as .xml files, for -two reasons: first, text editors in general can highlights XML syntax, -even for XML fragments, and second, this allows normal revision tracking +two reasons: first, text editors in general can highlight XML syntax in +well-balanced texts; and second, this allows normal revision tracking per file, without requiring weird changes on `revcheck.php`. Note that is *invalid* to place XML declaration in these fragment files, at least -in files that are invalid XML documents (on multi node rooted ones). +in files that are invalid XML documents (on multi-node rooted ones).
# Grouped entities files, file tracked For very small textual entities, down to simple text words or single -tag elements, that may never change, individual entity tracking is +tag elements that may never change, individual entity tracking is an overkill. This script also loads grouped XML Entities files, at some expected locations, with specific semantics. These grouped files are really normal XML files, correctly annotated -with XML namespaces used on manual, so any individual exported entity -have correct anc clean XML namespace annotations. These grouped entity +with XML namespaces used on manuals, so any individual exported entity +has correct and clean XML namespace annotations. These grouped entity files are tracked normally by revcheck, but are not directly included in manual.xml.in, as they only participate in general entity loading, described above. @@ -98,17 +99,15 @@ $langs = []; $normal = true; // Normal configure.php mode -$debug = false; // Detailed console mode for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) if ( $argv[$idx] == "--debug" ) - { $normal = false; - $debug = true; - } else $langs[] = $argv[$idx]; +$debug = ! $normal; + if ( $normal ) print "Creating .entities.ent..."; else @@ -154,12 +153,12 @@ class Entities private static string $filename = __DIR__ . 
"/../temp/entities.ent"; // idempotent - private static array $entities = []; // All entities, overwriten + private static array $entities = []; // All entities, bi duplications private static array $global = []; // Entities expected not replaced private static array $replace = []; // Entities expected replaced / translated private static array $remove = []; // Entities expected not replaced and not used private static array $count = []; // Name / Count - private static array $slow = []; // External entities, slow, uncontroled overwrite + private static array $slow = []; // External entities, slow, uncontrolled file overwrites static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) { @@ -184,7 +183,7 @@ static function put( string $path , string $name , string $text , bool $global = static function slow( string $path ) { if ( isset( $slow[$path] ) ) - fwrite( STDERR , "Unexpected physical file ovewrite: $path\n" ); + fwrite( STDERR , "Unexpected file overwrite: $path\n" ); $slow[ $path ] = $path; } From 3aec0af0e5b2dd30f6624e27516e99b938c1f178 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 10:52:34 -0300 Subject: [PATCH 06/11] Detect duplicated entity names on first language --- scripts/entities.php | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/scripts/entities.php b/scripts/entities.php index d9402bb666..3e9fcb86ba 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -98,20 +98,21 @@ $filename = Entities::rotateOutputFile(); // idempotent $langs = []; -$normal = true; // Normal configure.php mode +$normal = true; +$debug = false; for( $idx = 1 ; $idx < count( $argv ) ; $idx++ ) if ( $argv[$idx] == "--debug" ) $normal = false; else $langs[] = $argv[$idx]; - $debug = ! 
$normal; if ( $normal ) print "Creating .entities.ent..."; else print "Creating .entities.ent in debug mode.\n"; +$debug = ! $normal; loadEnt( __DIR__ . "/../global.ent" , global: true , warnMissing: true ); foreach( $langs as $lang ) @@ -120,6 +121,7 @@ loadEnt( __DIR__ . "/../../$lang/manual.ent" , translate: true , warnMissing: true ); loadEnt( __DIR__ . "/../../$lang/remove.ent" , remove: true ); loadDir( $langs , $lang ); + Entities::$debugUnique = false; } Entities::writeOutputFile(); @@ -132,6 +134,8 @@ echo ", " , Entities::$countReplacedGlobal , " global replaced"; if ( Entities::$countReplacedRemove > 0 ) echo ", " , Entities::$countReplacedRemove , " remove replaced"; +if ( Entities::$countDuplicated > 0 ) + echo ", " , Entities::$countDuplicated , " duplicated (first language)"; echo ".\n"; exit; @@ -146,20 +150,24 @@ public function __construct( class Entities { - public static int $countUnstranslated = 0; - public static int $countReplacedGlobal = 0; - public static int $countReplacedRemove = 0; - public static int $countTotalGenerated = 0; - private static string $filename = __DIR__ . 
"/../temp/entities.ent"; // idempotent private static array $entities = []; // All entities, bi duplications private static array $global = []; // Entities expected not replaced private static array $replace = []; // Entities expected replaced / translated private static array $remove = []; // Entities expected not replaced and not used + private static array $unique = []; // For detecting duplicated global+en entities private static array $count = []; // Name / Count private static array $slow = []; // External entities, slow, uncontrolled file overwrites + public static bool $debugUnique = true; // Start on unique mode, disable on second language + + public static int $countUnstranslated = 0; + public static int $countReplacedGlobal = 0; + public static int $countReplacedRemove = 0; + public static int $countTotalGenerated = 0; + public static int $countDuplicated = 0; + static function put( string $path , string $name , string $text , bool $global = false , bool $replace = false , bool $remove = false ) { $entity = new EntityData( $path , $name , $text ); @@ -174,10 +182,22 @@ static function put( string $path , string $name , string $text , bool $global = if ( $remove ) Entities::$remove[ $name ] = $name; - if ( ! isset( Entities::$count[$name] ) ) + if ( ! 
isset( Entities::$count[ $name ] ) ) Entities::$count[$name] = 1; else Entities::$count[$name]++; + + if ( Entities::$debugUnique ) + { + if ( isset( Entities::$unique[ $name ] ) ) + { + Entities::$countDuplicated++; + if ( Entities::$countDuplicated == 1 ) + fwrite( STDERR , "\n" ); + fwrite( STDERR , "\n Duplicated entity: $name\n" ); + } + Entities::$unique[ $name ] = $entity; + } } static function slow( string $path ) From 458f944d6569bdd8e8bd2a3fb352a617f28c02c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 11:04:16 -0300 Subject: [PATCH 07/11] Align output --- scripts/entities.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/entities.php b/scripts/entities.php index 3e9fcb86ba..2e927c8cd7 100644 --- a/scripts/entities.php +++ b/scripts/entities.php @@ -193,8 +193,8 @@ static function put( string $path , string $name , string $text , bool $global = { Entities::$countDuplicated++; if ( Entities::$countDuplicated == 1 ) - fwrite( STDERR , "\n" ); - fwrite( STDERR , "\n Duplicated entity: $name\n" ); + fwrite( STDERR , "\n\n" ); + fwrite( STDERR , " Duplicated entity: $name\n" ); } Entities::$unique[ $name ] = $entity; } From 63b02f9cd83a6fd0c6c88bbdc621fd4f6f69192f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20L=20F=20S=20Bacci?= Date: Mon, 9 Dec 2024 11:16:32 -0300 Subject: [PATCH 08/11] Reserve space for revision on original files --- scripts/dtdent-split.php | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/dtdent-split.php b/scripts/dtdent-split.php index d5d684c446..d23863b315 100644 --- a/scripts/dtdent-split.php +++ b/scripts/dtdent-split.php @@ -83,9 +83,10 @@ foreach( $entities as $name => $text ) { $file = "$outdir/$name.xml"; - $header = ""; - if ( $hash != "" ) + if ( $hash == "" ) + $header = ''; + else $header .= "\n"; file_put_contents( $file , $header . 
$text ); From 7126963cb238fae8c8d50ee6a5461455b25973a2 Mon Sep 17 00:00:00 2001 From: Jordi Kroon Date: Sat, 13 Jun 2026 19:39:23 +0200 Subject: [PATCH 09/11] re-add entities to manual.xml that went lost in the rebase --- manual.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/manual.xml b/manual.xml index 1a2b6fc947..904c0455b8 100644 --- a/manual.xml +++ b/manual.xml @@ -5,6 +5,10 @@ %configure; + + +%manual-entities; + %translation-defs; %translation-snippets; @@ -51,7 +55,6 @@ &install.cloud.index; &install.fpm.index; &install.pecl; - &install.composer; &install.pie; &install.ini; From 215f263eebc6e8215ad283a14f2ec9650758566b Mon Sep 17 00:00:00 2001 From: Jordi Kroon Date: Sat, 13 Jun 2026 19:41:05 +0200 Subject: [PATCH 10/11] add newline at eof --- entities/global.ent-dist | 2 +- entities/manual.ent-dist | 2 +- entities/remove.ent-dist | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/entities/global.ent-dist b/entities/global.ent-dist index 1d5c90fe82..a453871ca3 100644 --- a/entities/global.ent-dist +++ b/entities/global.ent-dist @@ -16,4 +16,4 @@ - \ No newline at end of file + diff --git a/entities/manual.ent-dist b/entities/manual.ent-dist index 62ca585503..d93f720ded 100644 --- a/entities/manual.ent-dist +++ b/entities/manual.ent-dist @@ -11,4 +11,4 @@ - \ No newline at end of file + diff --git a/entities/remove.ent-dist b/entities/remove.ent-dist index 6bf8988ad5..18ae9e6288 100644 --- a/entities/remove.ent-dist +++ b/entities/remove.ent-dist @@ -17,4 +17,4 @@ - \ No newline at end of file + From ab0fd162098fbbece8a5b037338fa7c487ca6836 Mon Sep 17 00:00:00 2001 From: Jordi Kroon Date: Sat, 13 Jun 2026 19:42:34 +0200 Subject: [PATCH 11/11] re-add &install.composer; --- manual.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/manual.xml b/manual.xml index 904c0455b8..a5899fcd26 100644 --- a/manual.xml +++ b/manual.xml @@ -55,6 +55,7 @@ &install.cloud.index; &install.fpm.index; &install.pecl; + 
&install.composer; &install.pie; &install.ini;