snapshot commit of tool

Merith 2025-07-15 18:53:50 -07:00
parent 3f72a6ba5a
commit 4aef3ca47e
6 changed files with 377 additions and 25 deletions

.tools/mw2md/.gitignore (new file, vendored)

@@ -0,0 +1,3 @@
composer.phar
composer.lock
vendor

.tools/mw2md/Dockerfile

@@ -1,26 +1,25 @@
## CREDIT https://github.com/realrubberduckdev/mediawiki-to-markdown/
FROM pandoc/latex
WORKDIR /src
COPY composer.* ./
# Install composer, refer: https://github.com/geshan/docker-php-composer-alpine/blob/master/Dockerfile
RUN apk --update add wget \
    curl \
    git \
    php7 \
    php7-curl \
    php7-openssl \
    php7-iconv \
    php7-json \
    php7-mbstring \
    php7-phar \
    php7-xml \
    php7-simplexml \
    php7-dom --repository http://nl.alpinelinux.org/alpine/edge/testing/ && rm /var/cache/apk/*
RUN curl -sS https://getcomposer.org/installer | php -- --install-dir=/usr/bin --filename=composer
# end of install composer
RUN composer install
COPY . .
CMD ["sh"]

.tools/mw2md/README.md (new file)

@@ -0,0 +1,100 @@
MediaWiki to Markdown
=====================

> Snapshot of https://github.com/realrubberduckdev/mediawiki-to-markdown/
> commit hash `4ebf945e68984270c820e8fe6a892e0acfc6875d`

Convert MediaWiki pages to GitHub-flavored Markdown (or other formats supported by Pandoc). The conversion uses an XML export from MediaWiki and converts each wiki page to an individual markdown file. Directory structures are preserved, and the generated export can also include frontmatter for GitHub Pages.

You may also be interested in the forked versions of this codebase at https://github.com/philipashlock/mediawiki-to-markdown and https://github.com/outofcontrol/mediawiki-to-gfm.
## Requirements

* Docker
* PowerShell

## Export MediaWiki Pages

Export all of your pages as a single XML file by following these steps: http://en.wikipedia.org/wiki/Help:Export
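
`convert.php` reads the `<page>` elements from this export, taking each page's `<title>` and the wikitext inside `<revision><text>`. A minimal sketch of the structure it expects (the real export also carries a `<siteinfo>` block and revision metadata; the page names here are hypothetical):

```xml
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
  <page>
    <title>Getting Started</title>
    <revision>
      <text>== Welcome ==
See [[Getting Started/Install]] for setup notes.</text>
    </revision>
  </page>
</mediawiki>
```

Note that a `/` in a page title becomes a subdirectory in the output.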
## Run

The simplest way to run the conversion is with the `convert.ps1` script:

`.\convert.ps1 -convertFileArgFullPath C:\wiki.xml`
### Further granular run parameters

In order to use any other options, you will have to update the `$dockerRunCmd` variable in the `convert.ps1` script, for example as sketched below. The available options are documented in the sections that follow.
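
For example, to convert to `markdown_phpextra` and generate index files, the variable could be edited roughly like this (a sketch using the flags documented below):

```powershell
$dockerRunCmd = "php convert.php --filename=" + $fileToConvertPathForDockerImage +
    " --output=./output --format=markdown_phpextra --indexes=true"
```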
#### --filename

The only required parameter is `filename`, the name of the XML file you exported from MediaWiki, e.g.:

`php convert.php --filename=mediawiki.xml`
#### --output

You can also use `output` to specify an output folder, since each wiki page in the XML file generates its own separate markdown file.

`php convert.php --filename=mediawiki.xml --output=export`
#### --indexes

You can set `indexes` to `true` if you want pages with the same name as a directory to be renamed `index.md` and moved into that directory, as illustrated below.

`php convert.php --filename=mediawiki.xml --output=export --indexes=true`
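
For example, with pages named `Guide`, `Guide/Install`, and `Guide/Usage` (hypothetical names), the export is reorganized like this:

```
export/Guide.md         ->  export/Guide/index.md
export/Guide/Install.md     (unchanged)
export/Guide/Usage.md       (unchanged)
```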
#### --frontmatter

You can specify whether you want frontmatter included; it is automatically enabled when the output format is `markdown_github`. The generated block is shown below.

`php convert.php --filename=mediawiki.xml --output=export --format=markdown_phpextra --frontmatter=true`
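
For a page titled `Getting Started`, the generated frontmatter is built from the page title and its underscored URL:

```
---
title: Getting Started
permalink: /Getting_Started/
---
```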
#### --format

You can specify different output formats with `format`; the default is `markdown_github`.

`php convert.php --filename=mediawiki.xml --output=export --format=markdown_phpextra`

Supported pandoc formats are:
* asciidoc
* beamer
* context
* docbook
* docx
* dokuwiki
* dzslides
* epub
* epub3
* fb2
* haddock
* html
* html5
* icml
* json
* latex
* man
* markdown
* markdown_github
* markdown_mmd
* markdown_phpextra
* markdown_strict
* mediawiki
* native
* odt
* opendocument
* opml
* org
* plain
* revealjs
* rst
* rtf
* s5
* slideous
* slidy
* texinfo
* textile

.tools/mw2md/composer.json (new file)

@@ -0,0 +1,5 @@
{
    "require": {
        "ryakad/pandoc-php": "dev-master"
    }
}

.tools/mw2md/convert.php (new file)

@@ -0,0 +1,205 @@
<?php
require 'vendor/autoload.php';

ini_set('memory_limit', '-1');

// Load arguments passed from the CLI (see arguments() below)
$arguments = arguments($argv);

if (empty($arguments['filename'])) {
    echo "No input file specified. Use --filename=mediawiki.xml" . PHP_EOL . PHP_EOL;
    exit(1);
}

if (!empty($arguments['output'])) {
    $output_path = $arguments['output'];
    if (!file_exists($output_path)) {
        echo "Creating output directory $output_path" . PHP_EOL . PHP_EOL;
        mkdir($output_path, 0777, true);
    }
} else {
    $output_path = '';
}

if (!empty($arguments['format'])) {
    $format = $arguments['format'];
} else {
    $format = 'markdown_github';
}

// Frontmatter is on when --frontmatter is passed, and on by default for markdown_github
$add_meta = !empty($arguments['frontmatter']) || $format == 'markdown_github';
// Load the MediaWiki XML export
$file = file_get_contents($arguments['filename']);
$xml = str_replace('xmlns=', 'ns=', $file); // drop the default namespace so xpath() finds the nodes
$xml = new SimpleXMLElement($xml);
$result = $xml->xpath('page');

$count = 0;
$directory_list = array();

// Iterate through the exported pages
foreach ($result as $node) {
    $title = (string) $node->xpath('title')[0];
    $url = str_replace(' ', '_', $title);

    // A slash in the title puts the page into a subdirectory
    if (($slash = strpos($url, '/')) !== false && $slash > 0) {
        $title = str_replace('/', ' ', $title);
        $directory = substr($url, 0, $slash);
        $filename = substr($url, $slash + 1);
        $directory_list[$directory] = true;
    } else {
        $directory = '';
        $filename = $url;
    }

    $text = (string) $node->xpath('revision/text')[0];
    $text = html_entity_decode($text); // decode inline HTML entities
    $text = preg_replace_callback('/\[\[(.+?)\]\]/', 'new_link', $text); // make wiki links absolute-path references

    // Prepare page-title frontmatter to prepend to the converted text
    if ($add_meta) {
        $frontmatter = "---\n";
        $frontmatter .= "title: $title\n";
        $frontmatter .= "permalink: /$url/\n";
        $frontmatter .= "---\n\n";
    }

    $pandoc = new Pandoc\Pandoc();
    $options = array(
        "from" => "mediawiki",
        "to" => $format
    );
    $text = $pandoc->runWith($text, $options);
    $text = str_replace('\_', '_', $text); // undo pandoc's underscore escaping

    if ($add_meta) {
        $text = $frontmatter . $text;
    }

    // Ensure a trailing slash on a non-empty output path
    if ($output_path != '' && substr($output_path, -1) != '/') {
        $output_path = $output_path . '/';
    }
    $directory = $output_path . $directory;

    // Create the target directory if necessary
    if (!empty($directory)) {
        if (!file_exists($directory)) {
            mkdir($directory);
        }
        $directory = $directory . '/';
    }

    // Write the converted page
    $file = fopen(normalizePath($directory . $filename . '.md'), 'w');
    fwrite($file, $text);
    fclose($file);
    $count++;
}
// Rename files that share a name with a directory to index.md inside that directory
if (!empty($directory_list) && !empty($arguments['indexes'])) {
    foreach (array_keys($directory_list) as $directory_name) {
        if (file_exists($output_path . $directory_name . '.md')) {
            rename($output_path . $directory_name . '.md', $output_path . $directory_name . '/index.md');
        }
    }
}

if ($count > 0) {
    echo "$count files converted" . PHP_EOL . PHP_EOL;
}
// Parse --key=value pairs and single-letter -k flags from the CLI into an array
function arguments($argv) {
    $_ARG = array();
    foreach ($argv as $arg) {
        if (preg_match('/--([^=]+)=(.*)/', $arg, $reg)) {
            $_ARG[$reg[1]] = $reg[2];
        } elseif (preg_match('/^-([a-zA-Z0-9])$/', $arg, $reg)) {
            $_ARG[$reg[1]] = 'true';
        }
    }
    return $_ARG;
}

// Rewrite a [[wiki link]] as an absolute-path reference,
// e.g. [[Getting Started]] becomes [[/Getting_Started|Getting Started]]
function new_link($matches) {
    if (strpos($matches[1], '|') === false) {
        // No link text given: use the page title as the text
        $new_link = str_replace(' ', '_', $matches[1]);
        return "[[/$new_link|{$matches[1]}]]";
    } else {
        // Keep the existing link text, prefix the target with a slash
        $link = trim(substr($matches[1], 0, strpos($matches[1], '|')));
        $link = '/' . str_replace(' ', '_', $link);
        $link_text = trim(substr($matches[1], strpos($matches[1], '|') + 1));
        return "[[$link|$link_text]]";
    }
}
// Collapse '.' and '..' segments without touching the filesystem
// Borrowed from http://php.net/manual/en/function.realpath.php
function normalizePath($path)
{
    $parts = array();                          // builds the new path from the good segments
    $path = str_replace('\\', '/', $path);     // replace backslashes with forward slashes
    $path = preg_replace('/\/+/', '/', $path); // collapse runs of slashes
    $segments = explode('/', $path);

    foreach ($segments as $segment) {
        if ($segment != '.') {
            $test = array_pop($parts);
            if (is_null($test)) {
                $parts[] = $segment;
            } elseif ($segment == '..') {
                if ($test == '..') {
                    $parts[] = $test;
                }
                if ($test == '..' || $test == '') {
                    $parts[] = $segment;
                }
            } else {
                $parts[] = $test;
                $parts[] = $segment;
            }
        }
    }
    return implode('/', $parts);
}

.tools/mw2md/convert.ps1 (new file)

@@ -0,0 +1,40 @@
[CmdletBinding()]
Param(
    [string]$convertFileArgFullPath = "wiki.xml"
)

# Create a directory, deleting it first if it already exists
function RefreshDirectory([string] $pathToDirectory)
{
    if (!(Test-Path $pathToDirectory -PathType Container))
    {
        Write-Host "Creating $pathToDirectory."
        New-Item -Path $pathToDirectory -Type directory | Out-Null
    }
    else
    {
        Write-Host "Deleting and recreating $pathToDirectory."
        Remove-Item $pathToDirectory -Recurse -Force
        New-Item -Path $pathToDirectory -Type directory | Out-Null
    }
}

# $PSScriptRoot is not populated in some older hosts
if (!$PSScriptRoot) {
    $PSScriptRoot = Split-Path $MyInvocation.MyCommand.Path -Parent
}

$inputFolder = Join-Path $PSScriptRoot "input"
$outputFolder = Join-Path $PSScriptRoot "output"

# Stage the XML export where the Docker build (COPY . .) will pick it up
RefreshDirectory $inputFolder
Copy-Item $convertFileArgFullPath $inputFolder
$fileToConvertFileName = Split-Path $convertFileArgFullPath -Leaf
$fileToConvertPathForDockerImage = "./input/" + $fileToConvertFileName

# Build the image, then run the conversion with ./output mounted back to the host
$image = "wiki2md"
docker build -t $image .
RefreshDirectory $outputFolder
$dockerRunCmd = "php convert.php --filename=" + $fileToConvertPathForDockerImage + " --output=./output"
docker run -v ${PSScriptRoot}/output/:/src/output $image sh -c $dockerRunCmd