From 4aef3ca47e9407dae58e4170764ad651f46b1801 Mon Sep 17 00:00:00 2001 From: Merith Date: Tue, 15 Jul 2025 18:53:50 -0700 Subject: [PATCH] snapshot commit of tool --- .tools/mw2md/.gitignore | 3 + .tools/mw2md/Dockerfile | 49 +++++---- .tools/mw2md/README.md | 100 ++++++++++++++++++ .tools/mw2md/composer.json | 5 + .tools/mw2md/convert.php | 205 +++++++++++++++++++++++++++++++++++++ .tools/mw2md/convert.ps1 | 40 ++++++++ 6 files changed, 377 insertions(+), 25 deletions(-) create mode 100644 .tools/mw2md/.gitignore create mode 100644 .tools/mw2md/README.md create mode 100644 .tools/mw2md/composer.json create mode 100644 .tools/mw2md/convert.php create mode 100644 .tools/mw2md/convert.ps1 diff --git a/.tools/mw2md/.gitignore b/.tools/mw2md/.gitignore new file mode 100644 index 0000000..3c520fa --- /dev/null +++ b/.tools/mw2md/.gitignore @@ -0,0 +1,3 @@ +composer.phar +composer.lock +vendor \ No newline at end of file diff --git a/.tools/mw2md/Dockerfile b/.tools/mw2md/Dockerfile index fb36186..e3a7176 100644 --- a/.tools/mw2md/Dockerfile +++ b/.tools/mw2md/Dockerfile @@ -1,26 +1,25 @@ -## CREDIT https://github.com/realrubberduckdev/mediawiki-to-markdown/ -FROM pandoc/latex -WORKDIR /src -COPY composer.* ./ - -# Install composer, refer: https://github.com/geshan/docker-php-composer-alpine/blob/master/Dockerfile -RUN apk --update add wget \ - curl \ - git \ - php7 \ - php7-curl \ - php7-openssl \ - php7-iconv \ - php7-json \ - php7-mbstring \ - php7-phar \ - php7-xml \ - php7-simplexml \ - php7-dom --repository http://nl.alpinelinux.org/alpine/edge/testing/ && rm /var/cache/apk/* - -RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/bin --filename=composer -# end of install composer - -RUN composer install -COPY . . 
+FROM pandoc/latex +WORKDIR /src +COPY composer.* ./ + +# Install composer, refer: https://github.com/geshan/docker-php-composer-alpine/blob/master/Dockerfile +RUN apk --update add wget \ + curl \ + git \ + php7 \ + php7-curl \ + php7-openssl \ + php7-iconv \ + php7-json \ + php7-mbstring \ + php7-phar \ + php7-xml \ + php7-simplexml \ + php7-dom --repository http://nl.alpinelinux.org/alpine/edge/testing/ && rm /var/cache/apk/* + +RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/bin --filename=composer +# end of install composer + +RUN composer install +COPY . . CMD ["sh"] \ No newline at end of file diff --git a/.tools/mw2md/README.md b/.tools/mw2md/README.md new file mode 100644 index 0000000..d8b2d98 --- /dev/null +++ b/.tools/mw2md/README.md @@ -0,0 +1,100 @@ +MediaWiki to Markdown +===================== + +> Snapshot of https://github.com/realrubberduckdev/mediawiki-to-markdown/ +> commit hash `4ebf945e68984270c820e8fe6a892e0acfc6875d` + +Convert MediaWiki pages to GitHub flavored Markdown (or other formats supported by Pandoc). The conversion uses an XML export from MediaWiki and converts each wiki page to an individual markdown file. Directory structures will be preserved. The generated export can also include frontmatter for Github pages. + +You may also be interested in a forked version of this codebase available at https://github.com/philipashlock/mediawiki-to-markdown + +You may also be interested in a forked version of this codebase available at https://github.com/outofcontrol/mediawiki-to-gfm + + +## Requirements + +* Docker +* Powershell + + +## Export MediaWiki Pages + +You'll export all your pages as a single XML file following these steps: http://en.wikipedia.org/wiki/Help:Export + + +## Run + +The simplest way to run is using the `convert.ps1` script. 
+ +`.\convert.ps1 -convertFileArgFullPath C:\wiki.xml` + +### Further granular run parameters + +In order to use any other options, you will have to update the `$dockerRunCmd` variable in the `convert.ps1` script. The available options are below. + +#### --filename #### +The only required parameter is `filename` for the name of the XML file you exported from MediaWiki, e.g.: + +`php convert.php --filename=mediawiki.xml` + +#### --output #### +You can also use `output` to specify an output folder since each wiki page in the XML file will generate its own separate markdown file. + +`php convert.php --filename=mediawiki.xml --output=export` + + +#### --indexes #### +You can set `indexes` as `true` if you want pages with the same name as a directory to be renamed as index.md and placed into their directory. + +`php convert.php --filename=mediawiki.xml --output=export --indexes=true` + +#### --frontmatter #### +You can specify whether you want frontmatter included. This is automatically set to `true` when the output format is `markdown_github`. + +`php convert.php --filename=mediawiki.xml --output=export --format=markdown_phpextra --frontmatter=true` + + +#### --format #### +You can specify different output formats with `format`. The default is `markdown_github`.
See + +`php convert.php --filename=mediawiki.xml --output=export --format=markdown_phpextra` + +Supported pandoc formats are: + +* asciidoc +* beamer +* context +* docbook +* docx +* dokuwiki +* dzslides +* epub +* epub3 +* fb2 +* haddock +* html +* html5 +* icml +* json +* latex +* man +* markdown +* markdown_github +* markdown_mmd +* markdown_phpextra +* markdown_strict +* mediawiki +* native +* odt +* opendocument +* opml +* org +* plain +* revealjs +* rst +* rtf +* s5 +* slideous +* slidy +* texinfo +* textile diff --git a/.tools/mw2md/composer.json b/.tools/mw2md/composer.json new file mode 100644 index 0000000..03e4fa1 --- /dev/null +++ b/.tools/mw2md/composer.json @@ -0,0 +1,5 @@ +{ + "require": { + "ryakad/pandoc-php": "dev-master" + } +} \ No newline at end of file diff --git a/.tools/mw2md/convert.php b/.tools/mw2md/convert.php new file mode 100644 index 0000000..2cfc353 --- /dev/null +++ b/.tools/mw2md/convert.php @@ -0,0 +1,205 @@ + +xpath('page'); +$count = 0; +$directory_list = array(); + +// Iterate through XML +while(list( , $node) = each($result)) { + + $title = $node->xpath('title'); + $title = $title[0]; + $url = str_replace(' ', '_', $title); + + if($slash = strpos($url, '/')){ + $title = str_replace('/', ' ', $title); + $directory = substr($url, 0, $slash); + $filename = substr($url, $slash+1); + $directory_list[$directory] = true; + } else { + $directory = ''; + $filename = $url; + } + + $text = $node->xpath('revision/text'); + $text = $text[0]; + $text = html_entity_decode($text); // decode inline html + $text = preg_replace_callback('/\[\[(.+?)\]\]/', "new_link", $text); // adds leading slash to links, "absolute-path reference" + + // prepare to append page title frontmatter to text + if ($add_meta) { + $frontmatter = "---\n"; + $frontmatter .= "title: $title\n"; + $frontmatter .= "permalink: /$url/\n"; + $frontmatter .= "---\n\n"; + } + + $pandoc = new Pandoc\Pandoc(); + $options = array( + "from" => "mediawiki", + "to" => $format + ); + 
$text = $pandoc->runWith($text, $options); + + $text = str_replace('\_', '_', $text); + + if ($add_meta) { + $text = $frontmatter . $text; + } + + if (substr($output_path, -1) != '/') $output_path = $output_path . '/'; + + $directory = $output_path . $directory; + + // create directory if necessary + if(!empty($directory)) { + if(!file_exists($directory)) { + mkdir($directory, 0777, true); // recursive: also creates a missing output root + } + + $directory = $directory . '/'; + } + + // create file + $file = fopen(normalizePath($directory . $filename . '.md'), 'w'); + fwrite($file, $text); + fclose($file); + + $count++; + +} + + +// Rename and move files with the same name as directories +if (!empty($directory_list) && !empty($arguments['indexes'])) { + + $directory_list = array_keys($directory_list); + + foreach ($directory_list as $directory_name) { + + if(file_exists($output_path . $directory_name . '.md')) { + rename($output_path . $directory_name . '.md', $output_path . $directory_name . '/index.md'); + } + } + +} + +if ($count > 0) { + echo "$count files converted" . PHP_EOL . PHP_EOL; +} + + +function arguments($argv) { // Collects "--name=value" options and single-character "-x" flags (stored as 'true') into an array + $_ARG = array(); + foreach ($argv as $arg) { + if (preg_match('/^--([^=]+)=(.*)/',$arg,$reg)) { + $_ARG[$reg[1]] = $reg[2]; + } elseif(preg_match('/^-([a-zA-Z0-9])/',$arg,$reg)) { // anchored so a hyphen inside an argument is not read as a flag + $_ARG[$reg[1]] = 'true'; + } + + } + return $_ARG; +} + + +function new_link($matches){ // preg_replace_callback handler: turns [[Page]] / [[Page|Text]] into [[/Page|Text]] + if(strpos($matches[1], '|') === false) { + $new_link = str_replace(' ', '_', $matches[1]); + return "[[/$new_link|{$matches[1]}]]"; + } else { + + $link = trim(substr($matches[1], 0, strpos($matches[1], '|'))); + $link = '/' .
str_replace(' ', '_', $link); + + $link_text = trim(substr($matches[1], strpos($matches[1], '|')+1)); + + return "[[$link|$link_text]]"; + } +} + + +// Borrowed from http://php.net/manual/en/function.realpath.php +function normalizePath($path) +{ + $parts = array(); // Array to build a new path from the good parts + $path = str_replace('\\', '/', $path); // Replace backslashes with forwardslashes + $path = preg_replace('/\/+/', '/', $path);// Combine multiple slashes into a single slash + $segments = explode('/', $path); // Collect path segments + $test = ''; // Initialize testing variable + foreach($segments as $segment) + { + if($segment != '.') + { + $test = array_pop($parts); + if(is_null($test)) + $parts[] = $segment; + else if($segment == '..') + { + if($test == '..') + $parts[] = $test; + if($test == '..' || $test == '') + $parts[] = $segment; + } + else + { + $parts[] = $test; + $parts[] = $segment; + } + } + } + return implode('/', $parts); +} + + +?> diff --git a/.tools/mw2md/convert.ps1 b/.tools/mw2md/convert.ps1 new file mode 100644 index 0000000..fe18cf4 --- /dev/null +++ b/.tools/mw2md/convert.ps1 @@ -0,0 +1,40 @@ +[CmdletBinding()] +Param( + [string]$convertFileArgFullPath = "wiki.xml" +) + +function RefreshDirectory([string] $pathToDirectory) +{ + if (!(Test-Path $pathToDirectory -PathType Container)) + { + Write-Host "Creating $pathToDirectory." + New-Item -Path $pathToDirectory -Type directory | out-null + } + else + { + Write-Host "Deleting and recreating $pathToDirectory."
+ Remove-Item $pathToDirectory -Recurse -Force + New-Item -Path $pathToDirectory -Type directory | out-null + } +} + +if(!$PSScriptRoot){ + $PSScriptRoot = Split-Path $MyInvocation.MyCommand.Path -Parent +} + +$inputFolder = Join-Path $PSScriptRoot "input" +$outputFolder = Join-Path $PSScriptRoot "output" + +RefreshDirectory $inputFolder +Copy-Item $convertFileArgFullPath $inputFolder + +$fileToConvertFileName = Split-Path $convertFileArgFullPath -Leaf +$fileToConvertPathForDockerImage = "./input/"+$fileToConvertFileName + +$image = "wiki2md" + +docker build -t $image $PSScriptRoot # build context is the script folder, where input/ was just populated + +RefreshDirectory $outputFolder +$dockerRunCmd = "php convert.php --filename="+$fileToConvertPathForDockerImage+" --output=./output" +docker run -v ${PSScriptRoot}/output/:/src/output $image sh -c $dockerRunCmd \ No newline at end of file