diff options
author | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:58:23 (GMT) |
---|---|---|
committer | Wade Brainerd <wadetb@gmail.com> | 2008-05-23 22:58:23 (GMT) |
commit | dd58bf72d6799438d8033cf7de6bc26a711734c3 (patch) | |
tree | 12c5c75e465b5c7889857116839f4dcd215fc230 /mwlib | |
parent | 02242194e156b00cef18506ab37d4a51ba36ac57 (diff) |
Rename step 1. Breaking the renames up because Git fails to recognize them when too many are done at once.
Diffstat (limited to 'mwlib')
48 files changed, 16902 insertions, 0 deletions
diff --git a/mwlib/EasyTimeline.pl b/mwlib/EasyTimeline.pl new file mode 100755 index 0000000..6486224 --- /dev/null +++ b/mwlib/EasyTimeline.pl @@ -0,0 +1,4718 @@ +#!/usr/bin/env perl + +# Copyright (C) 2004 Erik Zachte , email xxx\@chello.nl (nospam: xxx=epzachte) +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License version 2 +# as published by the Free Software Foundation. +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# See the GNU General Public License for more details, at +# http://www.fsf.org/licenses/gpl.html + +# history: +# 1.5 May 27 2004 : +# - when a chart contains only one bar this bar was always centered in the image +# now AlignBars works well in this case also ("justify" treated as "center") +# - interwiki links reinstalled e.g. [[de:Gorbachev]] +# - error msgs corrected +# - minimum image size fixed +# - line numbering adapted <timeline>spaces<br> does not count as line one in Wikipedia +# - line breaks in wiki links parsed correctly [[Vladimir~Ilyich~Lenin]] +# - partial url shown as hint for external link (in GIF/PNG) +# - BarData: no attribute 'text:..' supplied -> default to space = show no text on axis +# - PlotData: new attribute 'anchor:..' +# - revert html encoding of '<' & '>' by MediaWiki + +# 1.6 May 28 2004 : +# - SVG decode special chars in SVG input fixed +# - BarData: new attributes 'barset:..' and 'barcount:..' # autoincrement bar id +# - PlotData: new attribute 'barset:..' +# - LineData: new attribute 'layer:..', draw lines to back or front of bars and texts + +# 1.7 +# - EscapeShellArg (Tim Starling) + +# 1.8 June .. 
2004 : +# - optional autosizing of image (implied when auto incrementing bar count (also new)) +# - presentation left-right order of bars reversed on TimeAxis = orientation:vertical +# - TimeAxis option 'order:[normal|reverse]' added +# - BarData: option barcount replaced by auto incrementing bar count and 'break' and 'skip' attributes +# - DrawLines -> LineData (command renamed, but also restructured like PlotData, TextData) +# - new drawing options for LineData, now also lines parallel to time axis, or between arbitrary points +# - Preset command added (specify default settings with 'Preset =', two sets to start with) +# - 'text' attribute parsing bugs (# or : in text gave problems, spaces got lost) +# - PlotArea new attributes 'top' and 'right' make it possible to define plot area margins only +# so resizing image does not imply adjusting PlotArea 'width' and 'height' +# - PlotData option 'shift': only changing x or y value is now possible, e.g. shift=(,10) +# - command ScaleMajor: subs for time axis can now be specified verbatim in option 'text' +# - extra validation checks, defaults, etc +# - function PlotScale now provides workaround for Ploticus bug: auto incrementing dates failed + +# 1.9 June 2004 +# - stub display order fixed on non time axis + +# 1.10 July 2004 +# - temporary debug code (removed) + +# 1.11 August 2004 +# - dot in folder name in input path was misunderstood as start of file extension +# - utf-8 chars within 160-255 range are translated to extended ascii +# however internal font used by Ploticus has strange mapping so some are replaced +# by underscore or unaccented version of character +# this is a make do solution until full unicode support with external fonts will be added + + $version = "1.9" ; + + use Time::Local ; + use Getopt::Std ; + use Cwd ; + + $| = 1; # flush screen output + + print "EasyTimeline version $version\n" . + "Copyright (C) 2004 Erik Zachte\n" . + "Email xxx\@chello.nl (nospam: xxx=epzachte)\n\n" . 
+ "This program is free software; you can redistribute it\n" . + "and/or modify it under the terms of the \n" . + "GNU General Public License version 2 as published by\n" . + "the Free Software Foundation\n" . + "------------------------------------------------------\n" ; + + &SetImageFormat ; + &ParseArguments ; + &InitFiles ; + + open "FILE_IN", "<", $file_in ; + @lines = <FILE_IN> ; + close "FILE_IN" ; + + &InitVars ; + &ParseScript ; + + if ($CntErrors == 0) + { &WritePlotFile ; } + + if ($CntErrors == 1) + { &Abort ("1 error found") ; } + elsif ($CntErrors > 1) + { &Abort ("$CntErrors errors found") ; } + else + { + if (defined @Info) + { + print "\nINFO\n" ; + print @Info ; + print "\n" ; + } + if (defined @Warnings) + { + print "\nWARNING(S)\n" ; + print @Warnings ; + print "\n" ; + } + + if (! (-e $file_bitmap)) + { + print "\nImage $file_bitmap not created.\n" ; + if ((! (-e "pl.exe")) && (! (-e "pl"))) + { print "\nPloticus not found in local folder. Is it on your system path?\n" ; } + } + elsif (! (-e $file_vector)) + { + print "\nImage $file_vector not created.\n" ; + } + else + { print "\nREADY\nNo errors found.\n" ; } + } + + exit ; + +sub ParseArguments +{ + my $options ; + getopt ("iTAPe", \%options) ; + + &Abort ("Specify input file as: -i filename") if (! 
defined (@options {"i"})) ; + + $file_in = @options {"i"} ; + $listinput = @options {"l"} ; # list all input lines (not recommended) + $linkmap = @options {"m"} ; # make clickmap for inclusion in html + $makehtml = @options {"h"} ; # make test html file with gif/png + svg output + $bypass = @options {"b"} ; # do not use in Wikipedia:bypass some checks + $showmap = @options {"d"} ; # debug: shows clickable areas in gif/png + # The following parameters are used by MediaWiki + # to pass config settings from LocalSettings.php to + # the perl script + $tmpdir = @options {"T"} ; # For MediaWiki: temp directory to use + $plcommand = @options {"P"} ; # For MediaWiki: full path of ploticus command + $articlepath=@options {"A"} ; # For MediaWiki: Path of an article, relative to this servers root + + if (! defined @options {"A"} ) + { $articlepath="http://en.wikipedia.org/wiki/\$1"; } + + if (! -e $file_in) + { &Abort ("Input file '" . $file_in . "' not found.") ; } +} + +sub InitVars +{ + $true = 1 ; + $false = 0 ; + $CntErrors = 0 ; + $LinkColor = "brightblue" ; + $MapPNG = $false ; # switched when link or hint found + $MapSVG = $false ; # switched when link found + $WarnTextOutsideArea = 0 ; + $WarnOnRightAlignedText = 0 ; + + $hPerc = &EncodeInput ("\%") ; + $hAmp = &EncodeInput ("\&") ; + $hAt = &EncodeInput ("\@") ; + $hDollar = &EncodeInput ("\$") ; + $hBrO = &EncodeInput ("\(") ; + $hBrC = &EncodeInput ("\)") ; + $hSemi = &EncodeInput ("\;") ; + $hIs = &EncodeInput ("\=") ; + $hLt = &EncodeInput ("\<") ; + $hGt = &EncodeInput ("\>") ; +} + +sub InitFiles +{ + print "\nInput: Script file $file_in\n" ; + + $file = $file_in ; +# 1.10 dot ignore dots in folder names -> + $file =~ s/\.[^\\\/\.]*$// ; # remove extension + $file_name = $file ; + $file_bitmap = $file . "." . $fmt ; + $file_vector = $file . ".svg" ; + $file_png = $file . ".png" ; + $file_htmlmap = $file . ".map" ; + $file_html = $file . ".html" ; + $file_errors = $file . ".err" ; +# $file_pl_info = $file . 
".inf" ; +# $file_pl_err = $file . ".err" ; + print "Output: Image files $file_bitmap & $file_vector\n" ; + + if ($linkmap) + { print " Map file $file_htmlmap (add to html for clickable map)\n" ; } + if ($makehtml) + { print " HTML test file $file_html\n" ; } + + # remove previous output + if (-e $file_bitmap) { unlink $file_bitmap ; } + if (-e $file_vector) { unlink $file_vector ; } + if (-e $file_png) { unlink $file_png ; } + if (-e $file_htmlmap) { unlink $file_htmlmap ; } + if (-e $file_html) { unlink $file_html ; } + if (-e $file_errors) { unlink $file_errors ; } +} + +sub SetImageFormat +{ + $env = "" ; +# $dir = cwd() ; # is there a better way to detect OS? +# if ($dir =~ /\//) { $env = "Linux" ; $fmt = "png" ; $pathseparator = "/";} +# if ($dir =~ /\\/) { $env = "Windows" ; $fmt = "gif" ; $pathseparator = "\\";} +# cwd always to returns '/'s ? -> + $OS = $^O ; + if ($OS =~ /darwin/i) + { $env = "Linux"; $fmt = "png" ; $pathseparator = "/";} + elsif ($OS =~ /win/i) + { $env = "Windows" ; $fmt = "gif" ; $pathseparator = "\\";} + else + { $env = "Linux" ; $fmt = "png" ; $pathseparator = "/";} + + if ($env ne "") + { print "\nOS $env detected -> create image in $fmt format.\n" ; } + else + { + print "\nOS not detected. Assuming Windows -> create image in $fmt format.\n" ; + $env = "Windows" ; + } +} +sub ParseScript +{ + my $command ; # local version, $Command = global + $LineNo = 0 ; + $InputParsed = $false ; + $CommandNext = "" ; + $DateFormat = "x.y" ; + + $firstcmd = $true ; + &GetCommand ; + + &StoreColor ("white", &EncodeInput ("gray(0.999)"), "") ; + &StoreColor ("barcoldefault", &EncodeInput ("rgb(0,0.6,0)"), "") ; + + while (! $InputParsed) + { + if ($Command =~ /^\s*$/) + { &GetCommand ; next ; } + + if (! ($Command =~ /$hIs/)) + { &Error ("Invalid statement. No '=' found.") ; + &GetCommand ; next ; } + + if ($Command =~ /$hIs.*$hIs/) + { &Error ("Invalid statement. 
Multiple '=' found.") ; + &GetCommand ; next ; } + + my ($name, $value) = split ($hIs, $Command) ; + $name =~ s/^\s*(.*?)\s*$/$1/ ; + + if ($name =~ /PlotDividers/i) + { &Error ("Command 'PlotDividers' has been renamed to 'LineData', please adjust.") ; + &GetCommand ; next ; } + if ($name =~ /DrawLines/i) + { &Error ("Command 'DrawLines' has been renamed to 'LineData', please adjust.\n" . + " Reason for change is consistency: LineData now follows the same syntax rules as PlotData and TextData.") ; + &GetCommand ; next ; } + + if ((! ($name =~ /^(?:Define)\s/)) && + (! ($name =~ /^(?:AlignBars|BarData| + BackgroundColors|Colors|DateFormat|LineData| + ScaleMajor|ScaleMinor| + LegendLeft|LegendTop| + ImageSize|PlotArea|Legend| + Period|PlotData|Preset| + TextData|TimeAxis)$/xi))) + { &ParseUnknownCommand ; + &GetCommand ; next ; } + + $value =~ s/^\s*(.*?)\s*// ; + if (! ($name =~ /^(?:BarData|Colors|LineData|PlotData|TextData)$/i)) + { + if ((! (defined ($value))) || ($value eq "")) + { + if ($name =~ /Preset/i) + { + &Error ("$name definition incomplete. No value specified\n" . + " At the moment only one preset exists: 'TimeVertical_OneBar_UnitYear'.\n" . + " See also meta.wikipedia.org/wiki/EasyTimeline/Presets") ; + } + else + { &Error ("$name definition incomplete. No attributes specified") ; } + &GetCommand ; next ; } + } + + if ($name =~ /^(?:BackgroundColors|Colors|Period|ScaleMajor|ScaleMinor|TimeAxis)$/i) + { + my @attributes = split (" ", $value) ; + foreach $attribute (@attributes) + { + my ($attrname, $attrvalue) = split ("\:", $attribute) ; + if (! ($name."-".$attrname =~ /^(?:Colors-Value|Colors-Legend| + Period-From|Period-Till| + ScaleMajor-Color|ScaleMajor-Unit|ScaleMajor-Increment|ScaleMajor-Start| + ScaleMinor-Color|ScaleMinor-Unit|ScaleMinor-Increment|ScaleMinor-Start| + BackgroundColors-Canvas|BackgroundColors-Bars| + TimeAxis-Orientation|TimeAxis-Format)$/xi)) + { &Error ("$name definition invalid. 
Unknown attribute '$attrname'.") ; + &GetCommand ; next ; } + + if ((! defined ($attrvalue)) || ($attrvalue eq "")) + { &Error ("$name definition incomplete. No value specified for attribute '$attrname'.") ; + &GetCommand ; next ; } + } + } + + if ($Command =~ /^AlignBars/i) { &ParseAlignBars ; } + elsif ($Command =~ /^BackgroundColors/i) { &ParseBackgroundColors ; } + elsif ($Command =~ /^BarData/i) { &ParseBarData ; } + elsif ($Command =~ /^Colors/i) { &ParseColors ; } + elsif ($Command =~ /^DateFormat/i) { &ParseDateFormat ; } + elsif ($Command =~ /^Define/i) { &ParseDefine ; } + elsif ($Command =~ /^ImageSize/i) { &ParseImageSize ; } + elsif ($Command =~ /^Legend/i) { &ParseLegend ; } + elsif ($Command =~ /^LineData/i) { &ParseLineData ; } + elsif ($Command =~ /^Period/i) { &ParsePeriod ; } + elsif ($Command =~ /^PlotArea/i) { &ParsePlotArea ; } + elsif ($Command =~ /^PlotData/i) { &ParsePlotData ; } + elsif ($Command =~ /^Preset/i) { &ParsePreset ; } + elsif ($Command =~ /^Scale/i) { &ParseScale ; } + elsif ($Command =~ /^TextData/i) { &ParseTextData ; } + elsif ($Command =~ /^TimeAxis/i) { &ParseTimeAxis ; } + + &GetCommand ; + $firstcmd = $false ; + } + + if ($CntErrors == 0) + { &DetectMissingCommands ; } + + if ($CntErrors == 0) + { &ValidateAndNormalizeDimensions ; } +} + + +sub GetLine +{ + if ($#lines < 0) + { $InputParsed = $true ; return ("") ; } + + # running in Wikipedia context and first line empty ? + # skip first line without incrementing line count + # this is part behind <timeline> and will not be thought of as line 1 + if (defined @options {"A"}) + { + if (($#lines >= 0) && (@lines [0] =~ /^\s*$/)) + { $Line = shift (@lines) ; } + } + + $Line = "" ; + while (($#lines >= 0) && ($Line =~ /^\s*$/)) + { + $LineNo ++ ; + $Line = shift (@lines) ; + chomp ($Line) ; + + if ($listinput) + { print "$LineNo: " . &DecodeInput ($Line) . 
"\n" ; } + + # preserve '#' within double quotes + $Line =~ s/(\"[^\"]*\")/$a=$1,$a=~s^\#^\%\?\+^g,$a/ge ; + + $Line =~ s/#>.*?<#//g ; + if ($Line =~ /#>/) + { + $commentstart = $LineNo ; + $Line =~ s/#>.*?$// ; + } + elsif ($Line =~ /<#/) + { + undef $commentstart ; + $Line =~ s/^.*?<#//x ; + } + elsif (defined ($commentstart)) + { $Line = "" ; next ; } + + # remove single line comments (keep html char tags, like  ) + $Line =~ s/\&\#/\&\$\%/g ; + $Line =~ s/\#.*$// ; + $Line =~ s/\&\$\%/\&\#/g ; + $Line =~ s/\%\?\+/\#/g ; + $Line =~ s/\s*$//g ; + $Line =~ s/\t/ /g ; + } + + if ($Line !~ /^\s*$/) + { + $Line = &EncodeInput ($Line) ; + + if (! ($Line =~ /^\s*Define/i)) + { $Line =~ s/($hDollar[a-zA-Z0-9]+)/&GetDefine($Line,$1)/ge ; } + } + + if (($#lines < 0) && (defined ($commentstart))) + { &Error2 ("No matching end of comment found for comment block starting at line $commentstart.\n" . + "Text between \#> and <\# (multiple lines) or following \# (single line) will be treated as comment.") ; } + return ($Line) ; +} + +sub GetCommand +{ + undef (%Attributes) ; + $Command = "" ; + + if ($CommandNext ne "") + { + $Command = $CommandNext ; + $CommandNext = "" ; + } + else + { $Command = &GetLine ; } + + if ($Command =~ /^\s/) + { + &Error ("New command expected instead of data line (= line starting with spaces). 
Data line(s) ignored.\n") ; + $Command = &GetLine ; + while (($#lines >= 0) && ($Command =~ /^\s/)) + { $Command = &GetLine ; } + } + + if ($Command =~ /^[^\s]/) + { + $line = $Command ; + $line =~ s/^.*$hIs\s*// ; + &CollectAttributes ($line) ; + } +} + +sub GetData +{ + undef (%Attributes) ; + $Command = "" ; + $NoData = $false ; + my $line = &GetLine ; + + if ($line =~ /^[^\s]/) + { + $CommandNext = $line ; + $NoData = $true ; + return ("") ; + } + + if ($line =~ /^\s*$/) + { + $NoData = $true ; + return ("") ; + } + + $line =~ s/^\s*//g ; + &CollectAttributes ($line) ; +} + +sub CollectAttributes +{ + my $line = shift ; + + $line =~ s/(\slink\:[^\s\:]*)\:/$1'colon'/i ; # replace colon (:), would conflict with syntax + $line =~ s/(\stext\:[^\s\:]*)\:/$1'colon'/i ; # replace colon (:), would conflict with syntax + $line =~ s/(https?)\:/$1'colon'/i ; # replace colon (:), would conflict with syntax + + my $text ; + ($line, $text) = &ExtractText ($line) ; + $text =~ s/'colon'/:/ ; + + $line =~ s/( $hBrO .+? $hBrC )/&RemoveSpaces($1)/gxe ; + $line =~ s/\s*\:\s*/:/g ; + $line =~ s/([a-zA-Z0-9\_]+)\:/lc($1) . ":"/gxe ; + @Fields = split (" ", $line) ; + + $name = "" ; + foreach $field (@Fields) + { + if ($field =~ /\:/) + { + ($name, $value) = split (":", $field) ; + $name =~ s/^\s*(.*)\s*$/lc($1)/gxe ; + $value =~ s/^\s*(.*)\s*$/$1/gxe ; + if (($name ne "bar") && ($name ne "text") && ($name ne "link") && ($name ne "legend")) # && ($name ne "hint") + { $value = lc ($value) ; } + + if ($name eq "link") # restore colon + { $value =~ s/'colon'/:/ ; } + + if ($value eq "") + { + if ($name =~ /Text/i) + { $value = " " ; } + else + { &Error ("No value specified for attribute '$name'. 
Attribute ignored.") ; } + } + else + { @Attributes {$name} = $value ; } + } + else + { + if (defined (@Attributes {"single"})) + { &Error ("Invalid attribute '$field' ignored.\nSpecify attributes as 'name:value' pair(s).") ; } + else + { + $field =~ s/^\s*(.*)\s*$/$1/gxe ; + @Attributes {"single"} = $field ; + } + } + } + if (($name ne "") && (@Attributes {"single"} ne "")) + { + &Error ("Invalid attribute '" . @Attributes {"single"} . "' ignored.\nSpecify attributes as 'name:value' pairs.") ; + delete (@Attributes {"single"}) ; + } + + if ((defined ($text)) && ($text ne "")) + { @Attributes {"text"} = &ParseText ($text) ; } +} + +sub GetDefine +{ + my $command = shift ; + my $const = shift ; + $const = lc ($const) ; + my $value = @Consts {lc ($const)} ; + if (! defined ($value)) + { + &Error ("Unknown constant. 'Define $const = ... ' expected.") ; + return ($const); + } + return ($value) ; +} + +sub ParseAlignBars +{ + &CheckPreset ("AlignBars") ; + + $align = @Attributes {"single"} ; + if (! ($align =~ /^(?:justify|early|late)$/i)) + { &Error ("AlignBars value '$align' invalid. Specify 'justify', 'early' or 'late'.") ; return ; } + + $AlignBars = lc ($align) ; +} + +sub ParseBackgroundColors +{ + if (! &ValidAttributes ("BackgroundColors")) + { &GetData ; next ;} + + &CheckPreset ("BackGroundColors") ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /Canvas/i) + { + if (! &ColorPredefined ($attrvalue)) + { + if (! defined (@Colors {lc ($attrvalue)})) + { &Error ("BackgroundColors definition invalid. Attribute '$attribute': unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; return ; } + } + if (defined (@Colors {lc ($attrvalue)})) + { @Attributes {"canvas"} = @Colors { lc ($attrvalue) } ; } + else + { @Attributes {"canvas"} = lc ($attrvalue) ; } + } + elsif ($attribute =~ /Bars/i) + { + if (! 
defined (@Colors {lc ($attrvalue)})) + { &Error ("BackgroundColors definition invalid. Attribute '$attribute' unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; return ; } + + @Attributes {"bars"} = lc ($attrvalue) ; + } + } + + %BackgroundColors = %Attributes ; +} + +sub ParseBarData +{ + &GetData ; + if ($NoData) + { &Error ("Data expected for command 'BarData', but line is not indented.\n") ; return ; } + + my ($bar, $text, $link, $hint, $barset) ; # , $barcount) ; + + BarData: + while ((! $InputParsed) && (! $NoData)) + { + if (! &ValidAttributes ("BarData")) + { &GetData ; next ;} + + $bar = "" ; $link = "" ; $hint = "" ; $barset = "" ; # $barcount = "" ; + + my $data2 = $data ; + ($data2, $text) = &ExtractText ($data2) ; + @Attributes = split (" ", $data2) ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /^Bar$/i) + { + $bar = $attrvalue ; + } + elsif ($attribute =~ /^BarSet$/i) + { + $barset = $attrvalue ; + } + # elsif ($attribute =~ /^BarCount$/i) + # { + # $barcount = $attrvalue ; + # if (($barcount !~ /^\d?\d?\d$/) || ($barcount < 2) || ($barcount > 200)) + # { &Error ("BarData attribute 'barcount' invalid. Specify a number between 2 and 200\n") ; + # &GetData ; next BarData ; } + # } + elsif ($attribute =~ /^Text$/i) + { + $text = $attrvalue ; + $text =~ s/\\n/~/gs ; + if ($text =~ /\~/) + { &Warning ("BarData attribute 'text' contains ~ (tilde).\n" . + "Tilde will not be translated into newline character (only in PlotData)") ; } + if ($text =~ /\^/) + { &Warning ("BarData attribute 'text' contains ^ (caret).\n" . + "Caret will not be translated into tab character (only in PlotData)") ; } + } + elsif ($attribute =~ /^Link$/i) + { + $link = &ParseText ($attrvalue) ; + + if ($link =~ /\[.*\]/) + { &Error ("BarData attribute 'link' contains implicit (wiki style) link.\n" . 
+ "Use implicit link style with attribute 'text' only.\n") ; + &GetData ; next BarData ; } + + $link = &EncodeURL (&NormalizeURL ($link)) ; + + $MapPNG = $true ; + } + } + + if (($bar eq "") && ($barset eq "")) + { &Error ("BarData attribute missing. Specify either 'bar' of 'barset'.\n") ; + &GetData ; next BarData ; } + + if (($bar ne "") && ($barset ne "")) + { &Error ("BarData attributes 'bar' and 'barset' are mutually exclusive.\nSpecify one of these per data line\n") ; + &GetData ; next BarData ; } + + # if (($barset ne "") && ($barcount eq "")) + # { &Error ("BarData attribute 'barset' specified without attribute 'barcount'.\n") ; + # &GetData ; next BarData ; } + + # if (($barset eq "") && ($barcount ne "")) + # { &Error ("BarData attribute 'barcount' specified without attribute 'barset'.\n") ; + # &GetData ; next BarData ; } + + if (($barset ne "") && ($link ne "")) + { &Error ("BarData attribute 'link' not valid in combination with attribute 'barset'.\n") ; + &GetData ; next BarData ; } + + if ($link ne "") + { + if ($text =~ /\[.*\]/) + { + &Warning ("BarData contains implicit link(s) in attribute 'text' and explicit attribute 'link'.\n" . + "Implicit link(s) ignored.") ; + $text =~ s/\[+ (?:[^\|]* \|)? ([^\]]*) \]+/$1/gx ; + } + + if ($hint eq "") + { $hint = &ExternalLinkToHint ($link) ; } + } + + if (($bar ne "") && ($bar !~ /[a-zA-Z0-9\_]+/)) + { &Error ("BarData attribute bar:'$bar' invalid.\nUse only characters 'a'-'z', 'A'-'Z', '0'-'9', '_'\n") ; + &GetData ; next BarData ; } + + if ($bar ne "") + { + if (@Axis {"time"} eq "x") + { push @Bars, $bar ; } + else + { unshift @Bars, $bar ; } + + if ($text ne "") + { @BarLegend {lc ($bar)} = $text ; } + else + { @BarLegend {lc ($bar)} = " " ; } + + if ($link ne "") + { @BarLink {lc ($bar)} = $link ; } + } + else + { +# for ($b = 1 ; $b <= $barcount ; $b++) +# { +# $bar = $barset . "#" . $b ; + + $bar = $barset . 
"#1" ; + if (@Axis {"time"} eq "x") + { push @Bars, $bar ; } + else + { unshift @Bars, $bar ; } + + if ($text ne "") + { @BarLegend {lc ($bar)} = $text . " - " . $b ; } + else + { @BarLegend {lc ($bar)} = " " ; } +# } + } + + + &GetData ; + } +} + +sub ParseColors +{ + + &GetData ; + if ($NoData) + { &Error ("Data expected for command 'Colors', but line is not indented.\n") ; return ; } + + Colors: + while ((! $InputParsed) && (! $NoData)) + { + if (! &ValidAttributes ("Colors")) + { &GetData ; next ;} + + &CheckPreset ("Colors") ; + + my $addtolegend = $false ; + my $legendvalue = "" ; + my $colorvalue = "" ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /Id/i) + { + $colorname = $attrvalue ; + } + elsif ($attribute =~ /Legend/i) + { + $addtolegend = $true ; + $legendvalue = $attrvalue ; + if ($legendvalue =~ /^[yY]$/) + { push @LegendData, $colorname ; } + elsif (! ($attrvalue =~ /^[nN]$/)) + { + $legendvalue = &ParseText ($legendvalue) ; + push @LegendData, $legendvalue ; + } + } + elsif ($attribute =~ /Value/i) + { + $colorvalue = $attrvalue ; + if ($colorvalue =~ /^white$/i) + { $colorvalue = "gray" . $hBrO . "0.999" . $hBrC ; } + } + } + + if (&ColorPredefined ($colorvalue)) + { + &StoreColor ($colorname, $colorvalue, $legendvalue) ; + &GetData ; next Colors ; + } + + if ($colorvalue =~ /^[a-z]+$/i) + { + if (! ($colorvalue =~ /^(?:gray|rgb|hsb)/i)) + { &Error ("Color value invalid: unknown constant '$colorvalue'.") ; + &GetData ; next Colors ; } + } + + if (! ($colorvalue =~ /^(?:gray|rgb|hsb) $hBrO .+? $hBrC/xi)) + { &Error ("Color value invalid. Specify constant or 'gray/rgb/hsb(numeric values)' ") ; + &GetData ; next Colors ; } + + if ($colorvalue =~ /^gray/i) + { + if ($colorvalue =~ /gray $hBrO (?:0|1|0\.\d+) $hBrC/xi) + { &StoreColor ($colorname, $colorvalue, $legendvalue) ; } + else + { &Error ("Color value invalid. 
Specify 'gray(x) where 0 <= x <= 1' ") ; } + + &GetData ; next Colors ; + } + + if ($colorvalue =~ /^rgb/i) + { + my $colormode = substr ($colorvalue,0,3) ; + if ($colorvalue =~ /rgb $hBrO + (?:0|1|0\.\d+) \, + (?:0|1|0\.\d+) \, + (?:0|1|0\.\d+) + $hBrC/xi) + { &StoreColor ($colorname, $colorvalue, $legendvalue) ; } + else + { &Error ("Color value invalid. Specify 'rgb(r,g,b) where 0 <= r,g,b <= 1' ") ; } + + &GetData ; next Colors ; + } + + if ($colorvalue =~ /^hsb/i) + { + my $colormode = substr ($colorvalue,0,3) ; + if ($colorvalue =~ /hsb $hBrO + (?:0|1|0\.\d+) \, + (?:0|1|0\.\d+) \, + (?:0|1|0\.\d+) + $hBrC/xi) + { &StoreColor ($colorname, $colorvalue, $legendvalue) ; } + else + { &Error ("Color value invalid. Specify 'hsb(h,s,b) where 0 <= h,s,b <= 1' ") ; } + + &GetData ; next Colors ; + } + + &Error ("Color value invalid.") ; + &GetData ; + } +} + +sub StoreColor +{ + my $colorname = shift ; + my $colorvalue = shift ; + my $legendvalue = shift ; + if (defined (@Colors {lc ($colorname)})) + { &Warning ("Color '$colorname' redefined.") ; } + @Colors {lc ($colorname)} = lc ($colorvalue) ; + if ((defined ($legendvalue)) && ($legendvalue ne "")) + { @ColorLabels {lc ($colorname)} = $legendvalue ; } +} + +sub ParseDateFormat +{ + &CheckPreset ("DateFormat") ; + + my $datevalue = lc (@Attributes {"single"}) ; + $datevalue =~ s/\s//g ; + $datevalue = lc ($datevalue) ; + if (($datevalue ne "dd/mm/yyyy") && ($datevalue ne "mm/dd/yyyy") && ($datevalue ne "yyyy") && ($datevalue ne "x.y")) + { &Error ("Invalid DateFormat. Specify as 'dd/mm/yyyy', 'mm/dd/yyyy', 'yyyy' or 'x.y'\n" . + " (use first two only for years >= 1800)\n") ; return ; } + + $DateFormat = $datevalue ; +} + +sub ParseDefine +{ + my $command = $Command ; + my $command2 = $command ; + $command2 =~ s/^Define\s*//i ; + + my ($name, $value) = split ($hIs, $command2) ; + $name =~ s/^\s*(.*?)\s*$/$1/g ; + $value =~ s/^\s*(.*?)\s*$/$1/g ; + + if (! ($name =~ /^$hDollar/)) + { &Error ("Define '$name' invalid. 
Name does not start with '\$'.") ; return ; } + if (! ($name =~ /^$hDollar[a-zA-Z0-9\_]+$/)) + { &Error ("Define '$name' invalid. Valid characters are 'a'-'z', 'A'-'Z', '0'-'9', '_'.") ; return ; } + + $value =~ s/($hDollar[a-zA-Z0-9]+)/&GetDefine($command,$1)/ge ; + @Consts {lc ($name)} = $value ; +} + +sub ParseLineData +{ + &GetData ; + if ($NoData) + { &Error ("Data expected for command 'LineData', but line is not indented.\n") ; return ; } + + if ((! (defined ($DateFormat))) || (! (defined (@Period {"from"})))) + { + if (! (defined ($DateFormat))) + { &Error ("LineData invalid. No (valid) command 'DateFormat' specified in previous lines.") ; } + else + { &Error ("LineData invalid. No (valid) command 'Period' specified in previous lines.") ; } + + while ((! $InputParsed) && (! $NoData)) + { &GetData ; } + return ; + } + + my ($at, $from, $till, $atpos, $frompos, $tillpos, $color, $layer, $width, $points, $explanation) ; + + $layer = "front" ; + $width = 2.0 ; + + my $data2 = $data ; + + LineData: + while ((! $InputParsed) && (! $NoData)) + { + $at = "" ; $from = "" ; $till = "" ; $atpos = "" ; $frompos = "" ; $tillpos = "" ; $points = "" ; + + &CheckPreset ("LineData") ; + + if (! &ValidAttributes ("LineData")) + { &GetData ; next ;} + + if (defined (@LineDefs {"color"})) { $color = @LineDefs {"color"} ; } + if (defined (@LineDefs {"layer"})) { $layer = @LineDefs {"layer"} ; } + if (defined (@LineDefs {"width"})) { $width = @LineDefs {"width"} ; } + if (defined (@LineDefs {"frompos"})) { $frompos = @LineDefs {"frompos"} ; } + if (defined (@LineDefs {"tillpos"})) { $tillpos = @LineDefs {"tillpos"} ; } + if (defined (@LineDefs {"atpos"})) { $atpos = @LineDefs {"atpos"} ; } + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /^(?:At|From|Till)$/i) + { + if ($attrvalue =~ /^Start$/i) + { $attrvalue = @Period {"from"} ; } + + if ($attrvalue =~ /^End$/i) + { $attrvalue = @Period {"till"} ; } + + if (! 
&ValidDateFormat ($attrvalue)) + { &Error ("LineData attribute '$attribute' invalid.\n" . + "Date does not conform to specified DateFormat '$DateFormat'.") ; + &GetData ; next LineData ; } + + if (! &ValidDateRange ($attrvalue)) + { &Error ("LineData attribute '$attribute' invalid.\n" . + "Date '$attrvalue' not within range as specified by command Period.") ; + &GetData ; next LineData ; } + +# if (substr ($attrvalue,6,4) < 1800) +# { &Error ("LineData attribute '$attribute' invalid. Specify year >= 1800.") ; +# &GetData ; next LineData ; } + + if ($attribute =~ /At/i) + { + $at = $attrvalue ; $from = "" ; $till = "" ; } + elsif ($attribute =~ /From/i) + { $from = $attrvalue ; $at = "" ; } + else + { $till = $attrvalue ; $at = "" ; } + } + elsif ($attribute =~ /^(?:atpos|frompos|tillpos)$/i) + { + if ($attrvalue =~ /^(?:Start|End)$/i) + { $attrvalue = lc ($attrvalue) ; } + elsif (! &ValidAbs ($attrvalue)) + { &Error ("LineData attribute '$attribute' invalid.\n" . + "Specify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; + &GetData ; next LineData ; } + + if ($attribute =~ /atpos/i) + { $atpos = &Normalize ($attrvalue) ; } + elsif ($attribute =~ /frompos/i) + { $frompos = &Normalize ($attrvalue) ; } + else + { $tillpos = &Normalize ($attrvalue) ; } + } + elsif ($attribute =~ /Color/i) + { + if ((! &ColorPredefined ($attrvalue)) && (! defined (@Colors {lc ($attrvalue)}))) + { &Error ("LineData attribute '$attribute' invalid. Unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; + &GetData ; next LineData ; } + + if (! &ColorPredefined ($attrvalue)) + { $attrvalue = @Colors {lc ($attrvalue)} ; } + + $color = $attrvalue ; + } + elsif ($attribute =~ /Layer/i) + { + if (! 
($attrvalue =~ /^(?:back|front)$/i)) + { &Error ("LineData attribute '$attrvalue' invalid.\nSpecify back(default) or front") ; + &GetData ; next LineData ; } + + $layer = $attrvalue ; + } + elsif ($attribute =~ /Points/i) + { + $attribute =~ s/\s//g ; + + if ($attrvalue !~ /^$hBrO\d+\,\d+$hBrC$hBrO\d+\,\d+$hBrC$/) + { &Error ("LineData attribute '$attrvalue' invalid.\nSpecify 'points:(x1,y1)(x2,y2)'") ; + &GetData ; next LineData ; } + + $attrvalue =~ s/^$hBrO(\d+)\,(\d+)$hBrC$hBrO(\d+)\,(\d+)$hBrC$/$1,$2,$3,$4/ ; + $points = $attrvalue ; + } + elsif ($attribute =~ /Width/i) + { + if (! &ValidAbs ($attrvalue)) + { &Error ("LineData attribute '$attribute' invalid.\n" . + "Specify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; + &GetData ; next LineData ; } + + if (($attrvalue < 0.1) || ($attrvalue > 10)) + { &Error ("LineData attribute '$attribute' invalid.\n" . + "Specify value as between 0.1 and 10") ; + &GetData ; next LineData ; } + + $width = $attrvalue ; + } + } + + if (($at eq "") && ($from eq "") && ($till eq "") && ($points eq "")) # upd defaults + { + if ($color ne "") { @LineDefs {"color"} = $color ; } + if ($layer ne "") { @LineDefs {"layer"} = $layer ; } + if ($width ne "") { @LineDefs {"width"} = $width ; } + if ($atpos ne "") { @LineDefs {"atpos"} = $atpos ; } + if ($frompos ne "") { @LineDefs {"frompos"} = $frompos ; } + if ($tillpos ne "") { @LineDefs {"tillpos"} = $tillpos ; } + } + + if ($layer eq "") + { $layer = "back" ; } + + if ($color eq "") + { $color = "black" ; } + + $explanation = "\nA line is defined as follows:\n" . + " Perpendicular to the time axis: 'at frompos tillpos'\n" . + " Parralel to the time axis: 'from till atpos'\n" . + " Any direction: points(x1,y1)(x2,y2)\n" . + " at,from,till expect date/time values, just like with command PlotData\n" . + " frompos,tillpos,atpos,x1,x2,y1,y2 expect coordinates (e.g. 
pixels values)\n" ; + + if (($at ne "") && (($from ne "") || ($till ne "") || ($points ne ""))) + { &Error ("LineData attribute 'at' can not be combined with 'from', 'till' or 'points'\n" . $explanation) ; + $explanation = "" ; + &GetData ; next LineData ; } + + if ((($from ne "") && ($till eq "")) || (($from eq "") && ($till ne ""))) + { &Error ("LineData attributes 'from' and 'till' should always be specified together\n" . $explanation) ; + $explanation = "" ; + &GetData ; next LineData ; } + + if (($points ne "") && (($from ne "") || ($till ne "") || ($at ne ""))) + { &Error ("LineData attribute 'points' can not be combined with 'at', 'from' or 'till'\n" . $explanation) ; + $explanation = "" ; + &GetData ; next LineData ; } + + if ($at ne "") + { push @DrawLines, sprintf ("1|%s|%s|%s|%s|%s|%s\n", $at, $frompos, $tillpos, lc ($color), $width, lc ($layer)) ; } + + if ($from ne "") + { push @DrawLines, sprintf ("2|%s|%s|%s|%s|%s|%s\n", $atpos, $from, $till, lc ($color), $width, lc ($layer)) ; } + + if ($points ne "") + { push @DrawLines, sprintf ("3|%s|%s|%s|%s\n", $points, lc ($color), $width, lc ($layer)) ; } + &GetData ; + } +} + +sub ParseImageSize +{ + if (! &ValidAttributes ("ImageSize")) { return ; } + + &CheckPreset ("ImageSize") ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /Width|Height/i) + { + if ($attrvalue !~ /auto/i) + { + if (! &ValidAbs ($attrvalue)) + { &Error ("ImageSize attribute '$attribute' invalid.\n" . + "Specify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; return ; } + } + } + + elsif ($attribute =~ /BarIncrement/i) + { + if (! &ValidAbs ($attrvalue)) + { &Error ("ImageSize attribute '$attribute' invalid.\n" . 
+ "Specify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; return ; } + + @Attributes {"barinc"} = $attrvalue ; + } +# if ($attribute =~ /Width/i) +# { @Attributes {"width"} = $attrvalue ; } +# elsif ($attribute =~ /Height/i) +# { @Attributes {"height"} = $attrvalue ; } + } + + if ((@Attributes {"width"} =~ /auto/i) || (@Attributes {"height"} =~ /auto/i)) + { + if (@Attributes {"barinc"} eq "") + { &Error ("ImageSize attribute 'barincrement' missing.\n" . + "Automatic determination of image width or height implies specification of this attribute") ; return ; } + } + + if ((@Attributes {"width"} !~ /auto/i) && (@Attributes {"height"} !~ /auto/i)) + { + if (@Attributes {"barinc"} ne "") + { &Error ("ImageSize attribute 'barincrement' not valid now.\n" . + "This attribute is only valid (and mandatory) in combination with 'width:auto' or 'height:auto'") ; return ; } + } + + %Image = %Attributes ; +} + +sub ParseLegend +{ + if (! &ValidAttributes ("Legend")) { return ; } + + &CheckPreset ("Legend") ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /Columns/i) + { + if (($attrvalue < 1) || ($attrvalue > 4)) + { &Error ("Legend attribute 'columns' invalid. Specify 1,2,3 or 4") ; return ; } + } + elsif ($attribute =~ /Orientation/i) + { + if (! ($attrvalue =~ /^(?:hor|horizontal|ver|vertical)$/i)) + { &Error ("Legend attribute '$attrvalue' invalid. Specify hor[izontal] or ver[tical]") ; return ; } + + @Attributes {"orientation"} = substr ($attrvalue,0,3) ; + } + elsif ($attribute =~ /Position/i) + { + if (! ($attrvalue =~ /^(?:top|bottom|right)$/i)) + { &Error ("Legend attribute '$attrvalue' invalid.\nSpecify top, bottom or right") ; return ; } + } + elsif ($attribute =~ /Left/i) + { + if (! 
&ValidAbsRel ($attrvalue)) + { &Error ("Legend attribute '$attribute' invalid.\nSpecify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; return ; } } + elsif ($attribute =~ /Top/i) + { + if (! &ValidAbsRel ($attrvalue)) + { &Error ("Legend attribute '$attribute' invalid.\nSpecify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; return ; } } + elsif ($attribute =~ /ColumnWidth/i) + { + if (! &ValidAbsRel ($attrvalue)) + { &Error ("Legend attribute '$attribute' invalid.\nSpecify value as x[.y][px, in, cm] examples: '200', '20px', '1.3in'") ; return ; } + } + } + + if (defined (@Attributes {"position"})) + { + if (defined (@Attributes {"left"})) + { &Error ("Legend definition invalid. Attributes 'position' and 'left' are mutually exclusive.") ; return ; } + } + else + { + if ((! defined (@Attributes {"left"})) && (! defined (@Attributes {"top"}))) + { + &Info ("Legend definition: none of attributes 'position', 'left' or 'top' have been defined. Position 'bottom' assumed.") ; + @Attributes {"position"} = "bottom" ; + } + elsif ((! defined (@Attributes {"left"})) || (! defined (@Attributes {"top"}))) + { &Error ("Legend definition invalid. 
Specify 'position', or 'left' & 'top'.") ; return ; } + } + + if (@Attributes {"position"} =~ /right/i) + { + if (defined (@Attributes {"columns"})) + { &Error ("Legend definition invalid.\nAttribute 'columns' and 'position:right' are mutually exclusive.") ; return ; } + if (defined (@Attributes {"columnwidth"})) + { &Error ("Legend definition invalid.\nAttribute 'columnwidth' and 'position:right' are mutually exclusive.") ; return ; } + } + + if (@Attributes {"orientation"} =~ /hor/i) + { + if (@Attributes {"position"} =~ /right/i) + { &Error ("Legend definition invalid.\n'position:right' and 'orientation:horizontal' are mutually exclusive.") ; return ; } + if (defined (@Attributes {"columns"})) + { &Error ("Legend definition invalid.\nAttribute 'columns' and 'orientation:horizontal' are mutually exclusive.") ; return ; } + if (defined (@Attributes {"columnwidth"})) + { &Error ("Legend definition invalid.\nAttribute 'columnwidth' and 'orientation:horizontal' are mutually exclusive.") ; return ; } + } + + if ((@Attributes {"orientation"} =~ /hor/i) && (defined (@Attributes {"columns"}))) + { &Error ("Legend definition invalid.\nDo not specify attribute 'columns' with 'orientation:horizontal'.") ; return ; } + + if (@Attributes {"columns"} > 1) + { + if ((defined (@Attributes {"left"})) && (! defined (@Attributes {"columnwidth"}))) + { &Error ("Legend attribute 'columnwidth' not defined.\nThis is needed when attribute 'left' is specified.") ; return ; } + } + + if (! defined (@Attributes {"orientation"})) + { @Attributes {"orientation"} = "ver" ; } + + %Legend = %Attributes ; +} + +sub ParsePeriod +{ + if (! defined ($DateFormat)) + { &Error ("Period definition ambiguous. No (valid) command 'DateFormat' specified in previous lines.") ; return ; } + + if (! 
ValidAttributes ("Period")) { return ; } + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($DateFormat eq "yyyy") + { + if ($attrvalue !~ /^\-?\d+$/) + { &Error ("Period definition invalid.\nInvalid year '$attrvalue' specified for attribute '$attribute'.") ; return ; } + } + elsif ($DateFormat eq "x.y") + { + if (! ($attrvalue =~ /^\-?\d+(?:\.\d+)?$/)) + { &Error ("Period definition invalid.\nInvalid year '$attrvalue' specified for attribute '$attribute'.") ; return ; } + } + else + { + if (($attrvalue =~ /^\d+$/) && ($attrvalue >= 1800) && ($attrvalue <= 2030)) + { + if ($attribute =~ /^From$/i) + { $attrvalue = "01/01/" . $attrvalue ; } + if ($attribute =~ /^Till$/i) + { + if ($DateFormat eq "dd/mm/yyyy") + { $attrvalue = "31/12/" . $attrvalue ; } + else + { $attrvalue = "12/31/" . $attrvalue ; } + } + } + + $ValidDate = &ValidDateFormat ($attrvalue) ; + if (! $ValidDate) + { &Error ("Period attribute '$attribute' invalid.\n" . + "Date does not conform to specified DateFormat '$DateFormat'.") ; return ; } + if (substr ($attrvalue,6,4) < 1800) + { &Error ("Period attribute '$attribute' invalid. Specify year >= 1800.") ; return ; } + + @Attributes {$attribute} = $attrvalue ; + } + } + + %Period = %Attributes ; +} + +sub ParsePlotArea +{ + if (! &ValidAttributes ("PlotArea")) { return ; } + + &CheckPreset ("PlotArea") ; + + foreach $attribute (@Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + if (! &ValidAbsRel ($attrvalue)) + { &Error ("PlotArea attribute '$attribute' invalid.\n" . + "Specify value as x[.y][px, in, cm, %] examples: '200', '20px', '1.3in', '80%'") ; return ; } + } + + if ((@Attributes {"top"} ne "") && (@Attributes {"height"} ne "")) + { &Error ("PlotArea attributes 'top' and 'height' are mutually exclusive. 
Specify only one of them.") ; return ; } + + if ((@Attributes {"right"} ne "") && (@Attributes {"width"} ne "")) + { &Error ("PlotArea attributes 'right' and 'width' are mutually exclusive. Specify only one of them.") ; return ; } + + if ((@Attributes {"top"} eq "") && (@Attributes {"height"} eq "")) + { &Error ("PlotArea definition incomplete. Either attribute 'top' (advised) or 'height' should be specified") ; return ; } + + if ((@Attributes {"right"} eq "") && (@Attributes {"width"} eq "")) + { &Error ("PlotArea definition incomplete. Either attribute 'right' (advised) or 'width' should be specified") ; return ; } + + %PlotArea = %Attributes ; +} + +# command Bars found ? +# Y | N +# bar: found ? | bar: found ? +# Y | N | Y | N +# validate | previous bar: found? | @Bars contains | previous bar: found? +# bar:.. | | bar: ? | Y | N +# | Y | N | | copy | assume +# | copy | $#Bars .. | Y | N | bar: | bar:--- +# | bar: |== 0 | - | assume | | +# | | assume bar:--- | | bar:--- | | +# | |== 1 | +# | | assume @Bar[0] | +# | |> 1 | +# | | err | +sub ParsePlotData +{ + if (defined (@Bars)) + { $BarsCommandFound = $true ; } + else + { $BarsCommandFound = $false ; } + $prevbar = "" ; + + if ((! (defined ($DateFormat))) || (@Period {"from"} eq "") || (@Axis {"time"} eq "")) + { + if (! (defined ($DateFormat))) + { &Error ("PlotData invalid. No (valid) command 'DateFormat' specified in previous lines.") ; } + elsif (@Period {"from"} eq "") + { &Error ("PlotData invalid. No (valid) command 'Period' specified in previous lines.") ; } + else + { &Error ("PlotData invalid. No (valid) command 'TimeAxis' specified in previous lines.") ; } + + &GetData ; + while ((! $InputParsed) && (! 
$NoData)) + { &GetData ; } + return ; + } + + &GetData ; + if ($NoData) + { &Error ("Data expected for command 'PlotData', but line is not indented.\n") ; return ; } + + my ($bar, $at, $from, $till, $color, $bgcolor, $textcolor, $fontsize, $width, + $text, $anchor, $align, $shift, $shiftx, $shifty, $mark, $markcolor, $link, $hint) ; + + @PlotDefs {"anchor"} = "middle" ; + + PlotData: + while ((! $InputParsed) && (! $NoData)) + { + if (! &ValidAttributes ("PlotData")) + { &GetData ; next ;} + + $bar = "" ; # $barset = "" ; + $at = "" ; $from = "" ; $till = "" ; + $color = "barcoldefault" ; $bgcolor = "" ; $textcolor = "black" ; $fontsize = "S" ; $width = "0.25" ; + $text = "" ; $align = "left" ; $shift = "" ; $shiftx = "" ; $shifty = "" ; $anchor = "" ; + $mark = "" ; $markcolor = "" ; + $link = "" ; $hint = "" ; + + &CheckPreset ("PlotData") ; + + if (defined (@PlotDefs {"bar"})) { $bar = @PlotDefs {"bar"} ; } + # if (defined (@PlotDefs {"barset"})) { $barset = @PlotDefs {"barset"} ; } + if (defined (@PlotDefs {"color"})) { $color = @PlotDefs {"color"} ; } + if (defined (@PlotDefs {"bgcolor"})) { $bgcolor = @PlotDefs {"bgcolor"} ; } + if (defined (@PlotDefs {"textcolor"})) { $textcolor = @PlotDefs {"textcolor"} ; } + if (defined (@PlotDefs {"fontsize"})) { $fontsize = @PlotDefs {"fontsize"} ; } + if (defined (@PlotDefs {"width"})) { $width = @PlotDefs {"width"} ; } + if (defined (@PlotDefs {"anchor"})) { $anchor = @PlotDefs {"anchor"} ; } + if (defined (@PlotDefs {"align"})) { $align = @PlotDefs {"align"} ; } + if (defined (@PlotDefs {"shiftx"})) { $shiftx = @PlotDefs {"shiftx"} ; } + if (defined (@PlotDefs {"shifty"})) { $shifty = @PlotDefs {"shifty"} ; } + if (defined (@PlotDefs {"mark"})) { $mark = @PlotDefs {"mark"} ; } + if (defined (@PlotDefs {"markcolor"})) { $markcolor = @PlotDefs {"markcolor"} ; } +# if (defined (@PlotDefs {"link"})) { $link = @PlotDefs {"link"} ; } +# if (defined (@PlotDefs {"hint"})) { $hint = @PlotDefs {"hint"} ; } + + foreach 
$attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /^Bar$/i) + { + if (! ($attrvalue =~ /[a-zA-Z0-9\_]+/)) + { &Error ("PlotData attribute '$attribute' invalid.\n" . + "Use only characters 'a'-'z', 'A'-'Z', '0'-'9', '_'\n") ; + &GetData ; next PlotData ; } + + $attrvalue2 = $attrvalue ; + + if ($BarsCommandFound) + { + if (! &BarDefined ($attrvalue2)) + { &Error ("PlotData invalid. Bar '$attrvalue' not (properly) defined.") ; + &GetData ; next PlotData ; } + } + else + { + if (! &BarDefined ($attrvalue2)) + { + if (@Axis {"time"} eq "x") + { push @Bars, $attrvalue2 ; } + else + { unshift @Bars, $attrvalue2 ; } + } + } + $bar = $attrvalue2 ; + $prevbar = $bar ; + } + elsif ($attribute =~ /^BarSet$/i) + { + if (! ($attrvalue =~ /[a-zA-Z0-9\_]+/)) + { &Error ("PlotData attribute '$attribute' invalid.\n" . + "Use only characters 'a'-'z', 'A'-'Z', '0'-'9', '_'\n") ; + &GetData ; next PlotData ; } + + $attrvalue2 = $attrvalue ; + + if ($attrvalue =~ /break/i) + { $barndx = 0 ; } + elsif ($attrvalue =~ /skip/i) + { + $barndx ++ ; + &BarDefined ($prevbar . "#" . $barndx) ; + } + else + { + if ($BarsCommandFound) + { + if (! &BarDefined ($attrvalue2 . "#1")) + { &Error ("PlotData invalid. BarSet '$attrvalue' not (properly) defined with command BarData.") ; + &GetData ; next PlotData ; } + } + $bar = $attrvalue2 ; + if ($bar ne $prevbar) + { $barndx = 0 ; } + $prevbar = $bar ; + } + } + elsif ($attribute =~ /^(?:At|From|Till)$/i) + { + if ($attrvalue =~ /^Start$/i) + { $attrvalue = @Period {"from"} ; } + if ($attrvalue =~ /^End$/i) + { $attrvalue = @Period {"till"} ; } + + if (! &ValidDateFormat ($attrvalue)) + { + &Error ("PlotData attribute '$attribute' invalid.\n" . + "Date '$attrvalue' does not conform to specified DateFormat $DateFormat.") ; + &GetData ; next PlotData ; } + + if (! &ValidDateRange ($attrvalue)) + { &Error ("Plotdata attribute '$attribute' invalid.\n" . 
+ "Date '$attrvalue' not within range as specified by command Period.") ; + + &GetData ; next PlotData ; } + + if ($attribute =~ /^At$/i) + { $at = $attrvalue ; } + elsif ($attribute =~ /^From$/i) + { $from = $attrvalue ; } + else + { $till = $attrvalue ; } + } +# elsif ($attribute =~ /^From$/i) +# { +# if ($attrvalue =~ /^Start$/i) +# { $attrvalue = @Period {"from"} ; } + +# if (! &ValidDateFormat ($attrvalue)) +# { &Error ("PlotData invalid.\nDate '$attrvalue' does not conform to specified DateFormat $DateFormat.") ; +# &GetData ; next PlotData ; } + +# if (! &ValidDateRange ($attrvalue)) +# { &Error ("Plotdata attribute 'from' invalid.\n" . +# "Date '$attrvalue' not within range as specified by command Period.") ; +# &GetData ; next PlotData ; } + +# $from = $attrvalue ; +# } +# elsif ($attribute =~ /^Till$/i) +# { +# if ($attrvalue =~ /^End$/i) +# { $attrvalue = @Period {"till"} ; } + +# if (! &ValidDateFormat ($attrvalue)) +# { &Error ("PlotData invalid. Date '$attrvalue' does not conform to specified DateFormat $DateFormat.") ; +# &GetData ; next PlotData ; } + +# if (! &ValidDateRange ($attrvalue)) +# { &Error ("Plotdata attribute 'till' invalid.\n" . +# "Date '$attrvalue' not within range as specified by command Period.") ; +# &GetData ; next PlotData ; } + +# $till = $attrvalue ; +# } + elsif ($attribute =~ /^Color$/i) + { + if (! &ColorPredefined ($attrvalue)) + { + if (! defined (@Colors {lc ($attrvalue)})) + { &Error ("PlotData invalid. Attribute '$attribute' has unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; + &GetData ; next PlotData ; } + } + if (defined (@Colors {lc ($attrvalue)})) + { $color = @Colors { lc ($attrvalue) } ; } + else + { $color = lc ($attrvalue) ; } + + $color = $attrvalue ; + } + elsif ($attribute =~ /^BgColor$/i) + { + if (! &ColorPredefined ($attrvalue)) + { + if (! defined (@Colors {lc ($attrvalue)})) + { &Error ("PlotData invalid. Attribute '$attribute' has unknown color '$attrvalue'.\n" . 
+ " Specify command 'Color' before this command.") ; + &GetData ; next PlotData ; } + } + if (defined (@Colors {lc ($attrvalue)})) + { $bgcolor = @Colors { lc ($attrvalue) } ; } + else + { $bgcolor = lc ($attrvalue) ; } + } + elsif ($attribute =~ /^TextColor$/i) + { + if (! &ColorPredefined ($attrvalue)) + { + if (! defined (@Colors {lc ($attrvalue)})) + { &Error ("PlotData invalid. Attribute '$attribute' contains unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; + &GetData ; next PlotData ; } + } + if (defined (@Colors {lc ($attrvalue)})) + { $textcolor = @Colors { lc ($attrvalue) } ; } + else + { $textcolor = lc ($attrvalue) ; } + } + elsif ($attribute =~ /^Width$/i) + { + $width = &Normalize ($attrvalue) ; + if ($width > $MaxBarWidth) + { $MaxBarWidth = $width ; } + } + elsif ($attribute =~ /^FontSize$/i) + { + if (($attrvalue !~ /\d+(?:\.\d)?/) && ($attrvalue !~ /xs|s|m|l|xl/i)) + { &Error ("PlotData invalid. Specify for attribute '$attribute' a number of XS,S,M,L,XL.") ; + &GetData ; next PlotData ; } + + $fontsize = $attrvalue ; + if ($fontsize =~ /(?:XS|S|M|L|XL)/i) + { + if ($fontsize !~ /(?:xs|s|m|l|xl)/i) + { + if ($fontsize < 6) + { &Warning ("TextData attribute 'fontsize' value too low. Font size 6 assumed.\n") ; + $fontsize = 6 ; } + if ($fontsize > 30) + { &Warning ("TextData attribute 'fontsize' value too high. Font size 30 assumed.\n") ; + $fontsize = 30 ; } + } + } + } + elsif ($attribute =~ /^Anchor$/i) + { + if (! ($attrvalue =~ /^(?:from|till|middle)$/i)) + { &Error ("PlotData value '$attribute' invalid. Specify 'from', 'till' or 'middle'.") ; + &GetData ; next PlotData ; } + + $anchor = lc ($attrvalue) ; + } + elsif ($attribute =~ /^Align$/i) + { + if (! ($attrvalue =~ /^(?:left|right|center)$/i)) + { &Error ("PlotData value '$attribute' invalid. 
Specify 'left', 'right' or 'center'.") ; + &GetData ; next PlotData ; } + + $align = lc ($attrvalue) ; + } + elsif ($attribute =~ /^Shift$/i) + { + $shift = $attrvalue ; + $shift =~ s/$hBrO(.*?)$hBrC/$1/ ; + $shift =~ s/\s//g ; + ($shiftx2,$shifty2) = split (",", $shift) ; + if ($shiftx2 ne "") + { $shiftx = &Normalize ($shiftx2) ; } + if ($shifty2 ne "") + { $shifty = &Normalize ($shifty2) ; } + + if (($shiftx < -10) || ($shiftx > 10) || ($shifty < -10) || ($shifty > 10)) + { &Error ("PlotData invalid. Attribute '$shift', specify value(s) between -1000 and 1000 pixels = -10 and 10 inch.") ; + &GetData ; next PlotData ; } + } + elsif ($attribute =~ /^Text$/i) + { + $text = &ParseText ($attrvalue) ; + $text =~ s/\\n/\n/g ; + if ($text =~ /\^/) + { &Warning ("TextData attribute 'text' contains ^ (caret).\n" . + "Caret symbol will not be translated into tab character (use TextData when tabs are needed)") ; } + +# $text=~ s/(\[\[ [^\]]* \n [^\]]* \]\])/&NormalizeWikiLink($1)/gxe ; + $text=~ s/(\[\[? [^\]]* \n [^\]]* \]?\])/&NormalizeWikiLink($1)/gxe ; + } + elsif ($attribute =~ /^Link$/i) + { + $link = &ParseText ($attrvalue) ; + $link = &EncodeURL (&NormalizeURL ($link)) ; + } +# elsif ($attribute =~ /^Hint$/i) +# { +# $hint = &ParseText ($attrvalue) ; +# $hint =~ s/\\n/\n/g ; +# } + elsif ($attribute =~ /^Mark$/i) + { + $attrvalue =~ s/$hBrO (.*) $hBrC/$1/x ; + (@suboptions) = split (",", $attrvalue) ; + $mark = @suboptions [0] ; + if (! ($mark =~ /^(?:Line|None)$/i)) + { &Error ("PlotData invalid. Value '$mark' for attribute 'mark' unknown.") ; + &GetData ; next PlotData ; } + + if (defined (@suboptions [1])) + { + $markcolor = @suboptions [1] ; + + if (! &ColorPredefined ($markcolor)) + { + if (! defined (@Colors {lc ($markcolor)})) + { &Error ("PlotData invalid. Attribute 'mark': unknown color '$markcolor'.\n" . 
+ " Specify command 'Color' before this command.") ; + &GetData ; next PlotData ; } + } + $markcolor = lc ($markcolor) ; + } + else + { $markcolor = "black" ; } + } + else + { &Error ("PlotData invalid. Unknown attribute '$attribute' found.") ; + &GetData ; next PlotData ; } + } + +# if ($text =~ /\[\[.*\[\[/s) +# { &Error ("PlotData invalid. Text segment '$text' contains more than one wiki link. Only one allowed.") ; +# &GetData ; next PlotData ; } + +# if (($text ne "") || ($link ne "")) +# { ($text, $link, $hint) = &ProcessWikiLink ($text, $link, $hint) ; } + + $shift = $shiftx . "," . $shifty ; + + if ($MaxBarWidth eq "") + { $MaxBarWidth = $width - 0.001 ; } + + if ($bar ne "") + { + if (! defined (@BarLegend {lc($bar)})) + { @BarLegend {lc($bar)} = $bar ; } + if (! defined (@BarWidths {$bar})) + { @BarWidths {$bar} = $width ; } # was 0 ?? + } + + if (($at eq "") && ($from eq "") && ($till eq "")) # upd defaults + { + if ($bar ne "") { @PlotDefs {"bar"} = $bar ; } +# if ($barset ne "") { @PlotDefs {"barset"} = $barset ; } + if ($color ne "") { @PlotDefs {"color"} = $color ; } + if ($bgcolor ne "") { @PlotDefs {"bgcolor"} = $bgcolor ; } + if ($textcolor ne "") { @PlotDefs {"textcolor"} = $textcolor ; } + if ($fontsize ne "") { @PlotDefs {"fontsize"} = $fontsize ; } + if ($width ne "") { @PlotDefs {"width"} = $width ; } + if ($anchor ne "") { @PlotDefs {"anchor"} = $anchor ; } + if ($align ne "") { @PlotDefs {"align"} = $align ; } + if ($shiftx ne "") { @PlotDefs {"shiftx"} = $shiftx ; } + if ($shifty ne "") { @PlotDefs {"shifty"} = $shifty ; } + if ($mark ne "") { @PlotDefs {"mark"} = $mark ; } + if ($markcolor ne "") { @PlotDefs {"markcolor"} = $markcolor ; } +# if ($link ne "") { @PlotDefs {"link"} = $link ; } +# if ($hint ne "") { @PlotDefs {"hint"} = $hint ; } + &GetData ; next PlotData ; + } + + if ($bar eq "") + { + if ($prevbar ne "") + { $bar = $prevbar ; } + else + { +# if ($BarsCommandFound) +# { + if ($#Bars > 0) + { &Error ("PlotData invalid. 
Specify attribute 'bar'.") ; + &GetData ; next PlotData ; } + elsif ($#Bars == 0) + { + $bar = @Bars [0] ; + &Info ($data, "PlotData incomplete. Attribute 'bar' missing, value '" . @Bars [0] . "' assumed.") ; + } + else + { $bar = "1" ; } +# } +# else +# { +# if ($#Bars > 0) +# { &Error ("PlotData invalid. Attribute 'bar' missing.") ; +# &GetData ; next PlotData ; } +# elsif ($#Bars == 0) +# { +# $bar = @Bars [0] ; +# &Info ($data, "PlotData incomplete. Attribute 'bar' missing, value '" . @Bars [0] . "' assumed.") ; +# } +# else { $bar = "1" ; } +# } + $prevbar = $bar ; + } + } + + if (&BarDefined ($bar . "#1")) # bar is actually a bar set + { + if (($from ne "") || ($at ne "") || ($text eq " ")) # data line ? + { + $barndx++ ; + if (! &BarDefined ($bar . "#" . $barndx)) + { $barndx = 1 ; } + $bar = $bar . "#" . $barndx ; + # $text = $bar ; + } + } + + if (($at ne "") && (($from ne "") || ($till ne ""))) + { &Error ("PlotData invalid. Attributes 'at' and 'from/till' are mutually exclusive.") ; + &GetData ; next PlotData ; } + + if ((($from eq "") && ($till ne "")) || (($from ne "") && ($till eq ""))) + { &Error ("PlotData invalid. Specify attribute 'at' or 'from' + 'till'.") ; + &GetData ; next PlotData ; } + + + if ($at ne "") + { + if ($text ne "") + { + if ($align eq "") + { &Error ("PlotData invalid. Attribute 'align' missing.") ; + &GetData ; next PlotData ; } + if ($fontsize eq "") + { &Error ("PlotData invalid. Attribute '[font]size' missing.") ; + &GetData ; next PlotData ; } + if ($text eq "") + { &Error ("PlotData invalid. Attribute 'text' missing.") ; + &GetData ; next PlotData ; } + } + } + else + { + if (($text ne "") && ($anchor eq "")) + { &Error ("PlotData invalid. Attribute 'anchor' missing.") ; + &GetData ; next PlotData ; } + if ($color eq "") + { &Error ("PlotData invalid. Attribute 'color' missing.") ; + &GetData ; next PlotData ; } + if ($width eq "") + { &Error ("PlotData invalid. 
Attribute 'width' missing.") ; + &GetData ; next PlotData ; } + } + + if ($from ne "") + { + if (($link ne "") && ($hint eq "")) + { $hint = &ExternalLinkToHint ($link) ; } + + if (($link ne "") || ($hint ne "")) + { $MapPNG = $true ; } + if ($link ne "") + { $MapSVG = $true ; } + + push @PlotBars, sprintf ("%6.3f,%s,%s,%s,%s,%s,%s,\n", $width, $bar, $from, $till, lc ($color),$link,$hint) ; + if ($width > @BarWidths {$bar}) + { @BarWidths {$bar} = $width ; } + + if ($text ne "") + { + if ($anchor eq "from") + { $at = $from ; } + elsif ($anchor eq "till") + { $at = $till ; } + else + { $at = &DateMedium ($from, $till) ; } + } + + if (($mark ne "") && ($mark !~ /none/i)) + { + push @PlotLines, sprintf ("%s,%s,%s,%s,,,\n", $bar, $from, $from, lc ($markcolor)) ; + push @PlotLines, sprintf ("%s,%s,%s,%s,,,\n", $bar, $till, $till, lc ($markcolor)) ; + $mark = "" ; + } + } + + if ($at ne "") + { + if (($mark ne "") && ($mark !~ /none/i)) + { push @PlotLines, sprintf ("%s,%s,%s,%s,,,\n", $bar, $at, $at, lc ($markcolor)) ; } + + if ($text ne "") + { + my $textdetails = "" ; + + if ($link ne "") + { + if ($text =~ /\[.*\]/) + { + &Warning ("PlotData contains implicit link(s) in attribute 'text' and explicit attribute 'link'. " . + "Implicit link(s) ignored.") ; + $text =~ s/\[+ (?:[^\|]* \|)? 
([^\]]*) \]+/$1/gx ; + } + if ($hint eq "") + { $hint = &ExternalLinkToHint ($link) ; } + } + + if ($anchor eq "") + { $anchor = "middle" ; } + if ($align eq "") + { $align = "center" ; } + if ($color eq "") + { $color = "black" ; } + if ($fontsize eq "") + { $fontsize = "S" ; } + if ($adjust eq "") + { $adjust = "0,0" ; } + +# $textdetails = " textdetails: align=$align size=$size" ; +# if ($textcolor eq "") +# { $textcolor = "black" ; } +# if ($color ne "") +# { $textdetails .= " color=$textcolor" ; } + +# my ($xpos, $ypos) ; +# my $barcnt = 0 ; +# for ($b = 0 ; $b <= $#Bars ; $b++) +# { +# if (lc(@Bars [$b]) eq lc($bar)) +# { $barcnt = ($b + 1) ; last ; } +# } + +# if (@Axis {"time"} eq "x") +# { $xpos = "$at(s)" ; $ypos = "[$barcnt](s)" ; } +# else +# { $ypos = "$at(s)" ; $xpos = "[$barcnt](s)" ; } + +# if ($shift ne "") +# { +# my ($shiftx, $shifty) = split (",", $shift) ; +# if ($shiftx > 0) +# { $xpos .= "+$shiftx" ; } +# if ($shiftx < 0) +# { $xpos .= "$shiftx" ; } +# if ($shifty > 0) +# { $ypos .= "+$shifty" ; } +# if ($shifty < 0) +# { $ypos .= "$shifty" ; } +# } + + $text =~ s/\,/\#\%\$/g ; + $link =~ s/\,/\#\%\$/g ; + $hint =~ s/\,/\#\%\$/g ; + $shift =~ s/\,/\#\%\$/g ; + $textcolor =~ s/\,/\#\%\$/g ; + push @PlotText, sprintf ("%s,%s,%s,%s,%s,%s,%s,%s,%s", $at, $bar, $text, $textcolor, $fontsize, $align, $shift, $link, $hint) ; + } + } + + &GetData ; + } + + if ((! $BarsCommandFound) && ($#Bars > 1)) + { &Info2 ("PlotBars definition: no (valid) command 'BarData' found in previous lines.\nBars will presented in order of appearance in PlotData.") ; } + + $maxwidth = 0 ; + foreach $key (keys %BarWidths) + { + if (@BarWidths {$key} == 0) + { &Warning ("PlotData incomplete. 
No bar width defined for bar '$key', assume width from widest bar (used for line marks).") ; } + elsif (@BarWidths {$key} > $maxwidth) + { $maxwidth = @BarWidths {$key} ; } + } + foreach $key (keys %BarWidths) + { + if (@BarWidths {$key} == 0) + { @BarWidths {$key} = $maxwidth ; } + } +} + +sub ParsePreset +{ + if (! $firstcmd) + { &Error ("Specify 'Preset' command before any other commands, if desired at all.\n") ; return ; } + + $preset = @Attributes {"single"} ; + if ($preset !~ /^(?:TimeVertical_OneBar_UnitYear|TimeHorizontal_AutoPlaceBars_UnitYear)$/i) + { &Error ("Preset value invalid.\n" . + " At the moment two presets are available:\n" . + " TimeVertical_OneBar_UnitYear and TimeHorizontal_AutoPlaceBars_UnitYear\n" . + " See also meta.wikipedia.org/wiki/EasyTimeline/Presets") ; return ; } + + $Preset = $preset ; + + if ($Preset =~ /^TimeVertical_OneBar_UnitYear/i) + { + $DateFormat = "yyyy" ; + $AlignBars = "early" ; + @Axis {"format"} = "yyyy" ; + @Axis {"time"} = "y" ; + @PlotArea {"left"} = 45 ; + @PlotArea {"right"} = 10 ; + @PlotArea {"top"} = 10 ; + @PlotArea {"bottom"} = 10 ; + push @PresetList, "PlotArea|+|left|" . @PlotArea {"left"} ; + push @PresetList, "PlotArea|+|right|" . @PlotArea {"right"}; + push @PresetList, "PlotArea|+|top|" . @PlotArea {"top"} ; + push @PresetList, "PlotArea|+|bottom|" . @PlotArea {"bottom"} ; + push @PresetList, "PlotArea|-|width" ; + push @PresetList, "PlotArea|-|height" ; + push @PresetList, "Dateformat|-||yyyy" ; + push @PresetList, "TimeAxis|=|format|" . @Axis {"format"} ; + push @PresetList, "TimeAxis|=|orientation|vertical" ; + push @PresetList, "ScaleMajor|=|unit|year" ; + push @PresetList, "ScaleMinor|=|unit|year" ; + push @PresetList, "AlignBars|=||early" ; + push @PresetList, "PlotData|+|mark|" . $hBrO . "line,white" . $hBrC ; + push @PresetList, "PlotData|+|align|left" ; + push @PresetList, "PlotData|+|fontsize|S" ; + push @PresetList, "PlotData|+|width|20" ; + push @PresetList, "PlotData|+|shift|" . $hBrO . 
"20,0" . $hBrC ; + } + elsif ($Preset =~ /TimeHorizontal_AutoPlaceBars_UnitYear/i) + { + $DateFormat = "yyyy" ; + $AlignBars = "justify" ; + @Axis {"format"} = "yyyy" ; + @Axis {"time"} = "x" ; + @PlotArea {"left"} = 25 ; + @PlotArea {"right"} = 25 ; + @PlotArea {"top"} = 15 ; + @PlotArea {"bottom"} = 30 ; + @Image {"height"} = "auto" ; + @Image {"barinc"} = 20 ; + @BackgroundColors {"canvas"} = "gray(0.7)" ; + @Legend {"orientation"} = "ver" ; + @Legend {"left"} = @PlotArea {"left"}+10 ; + @Legend {"top"} = @PlotArea {"bottom"}+100 ; + &StoreColor ("canvas", &EncodeInput ("gray(0.7)"), "") ; + &StoreColor ("grid1", &EncodeInput ("gray(0.4)"), "") ; + &StoreColor ("grid2", &EncodeInput ("gray(0.2)"), "") ; + push @PresetList, "ImageSize|=|height|auto" ; + push @PresetList, "ImageSize|+|barincrement|20" ; + push @PresetList, "PlotArea|+|left|" . @PlotArea {"left"} ; + push @PresetList, "PlotArea|+|right|" . @PlotArea {"right"}; + push @PresetList, "PlotArea|+|top|" . @PlotArea {"top"} ; + push @PresetList, "PlotArea|+|bottom|" . @PlotArea {"bottom"} ; + push @PresetList, "PlotArea|-|width" ; + push @PresetList, "PlotArea|-|height" ; + push @PresetList, "Dateformat|-||yyyy" ; + push @PresetList, "TimeAxis|=|format|" . @Axis {"format"} ; + push @PresetList, "TimeAxis|=|orientation|horizontal" ; + push @PresetList, "ScaleMajor|=|unit|year" ; + push @PresetList, "ScaleMajor|+|grid|grid1" ; + push @PresetList, "ScaleMinor|=|unit|year" ; + push @PresetList, "AlignBars|=||justify" ; + push @PresetList, "Legend|+|orientation|" . @Legend {"orientation"} ; + push @PresetList, "Legend|+|left|" . @Legend {"left"} ; + push @PresetList, "Legend|+|top|" . @Legend {"top"} ; + push @PresetList, "PlotData|+|align|left" ; + push @PresetList, "PlotData|+|anchor|from" ; + push @PresetList, "PlotData|+|fontsize|M" ; + push @PresetList, "PlotData|+|width|15" ; + push @PresetList, "PlotData|+|textcolor|black" ; + push @PresetList, "PlotData|+|shift|" . $hBrO . "4,-6" . 
$hBrC ; + } +} + +sub ParseScale +{ + my ($scale) ; + + if ($Command =~ /ScaleMajor/i) + { $scale .= 'Major' ; } + else + { $scale .= 'Minor' ; } + + if (! ValidAttributes ("Scale" . $scale)) { return ; } + + &CheckPreset (Scale . $scale) ; + + @Scales {$scale} = $true ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /Grid/i) # preferred gridcolor instead of grid, grid allowed for compatability + { + if ((! &ColorPredefined ($attrvalue)) && (! defined (@Colors {lc ($attrvalue)}))) + { &Error ("Scale attribute '$attribute' invalid. Unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; return ; } + @Attributes {$scale . " grid"} = $attrvalue ; + delete (@Attributes {"grid"}) ; + } + elsif ($attribute =~ /Text/i) + { + $attrvalue =~ s/\~/\\n/g ; + $attrvalue =~ s/^\"//g ; + $attrvalue =~ s/\"$//g ; + @Attributes {$scale . " stubs"} = $attrvalue ; + } + elsif ($attribute =~ /Unit/i) + { + if ($DateFormat eq "yyyy") + { + if (! ($attrvalue =~ /^(?:year|years)$/i)) + { &Error ("Scale attribute '$attribute' invalid. DateFormat 'yyyy' implies 'unit:year'.") ; return ; } + } + else + { + if (! ($attrvalue =~ /^(?:year|month|day)s?$/i)) + { &Error ("Scale attribute '$attribute' invalid. Specify year, month or day.") ; return ; } + } + $attrvalue =~ s/s$// ; + @Attributes {$scale . " unit"} = $attrvalue ; + delete (@Attributes {"unit"}) ; + } + elsif ($attribute =~ /Increment/i) + { + if ((! ($attrvalue =~ /^\d+$/i)) || ($attrvalue == 0)) + { &Error ("Scale attribute '$attribute' invalid. Specify positive integer.") ; return ; } + @Attributes {$scale . " inc"} = $attrvalue ; + delete (@Attributes {"increment"}) ; + } + elsif ($attribute =~ /Start/i) + { + if (! (defined ($DateFormat))) + { &Error ("Scale attribute '$attribute' invalid.\n" . 
+ "No (valid) command 'DateFormat' specified in previous lines.") ; return ; } + + if (($DateFormat eq "dd/mm/yyyy") || ($DateFormat eq "mm/dd/yyyy")) + { + if (($attrvalue =~ /^\d+$/) && ($attrvalue >= 1800) && ($attrvalue <= 2030)) + { $attrvalue = "01/01/" . $attrvalue ; } + } + + if (! &ValidDateFormat ($attrvalue)) + { &Error ("Scale attribute '$attribute' invalid.\n" . + "Date does not conform to specified DateFormat '$DateFormat'.") ; return ; } + + if (($DateFormat =~ /\d\d\/\d\d\/\d\d\d\d/) && (substr ($attrvalue,6,4) < 1800)) + { &Error ("Scale attribute '$attribute' invalid.\n" . + " Specify year >= 1800.") ; return ; } + + if (! &ValidDateRange ($attrvalue)) + { &Error ("Scale attribute '$attribute' invalid.\n" . + "Date '$attrvalue' not within range as specified by command Period.") ; return ; } + + @Attributes {$scale . " start"} = $attrvalue ; + delete (@Attributes {"start"}) ; + } + if ($DateFormat eq "yyyy") { @Attributes {$scale . " unit"} = "year" ; } + } + + foreach $attribute (keys %Attributes) + { @Scales {$attribute} = @Attributes {$attribute} ; } +} + +sub ParseTextData +{ + &GetData ; + if ($NoData) + { &Error ("Data expected for command 'TextData', but line is not indented.\n") ; return ; } + + my ($pos, $tabs, $fontsize, $lineheight, $textcolor, $text, $link, $hint) ; + + TextData: + while ((! $InputParsed) && (! $NoData)) + { + if (! 
&ValidAttributes ("TextData")) + { &GetData ; next ;} + + &CheckPreset ("TextData") ; + + $pos = "" ; $tabs = "" ; $fontsize = "" ; $lineheight = "" ; $textcolor = "" ; $link = "" ; $hint = "" ; + + if (defined (@TextDefs {"tabs"})) { $tabs = @TextDefs {"tabs"} ; } + if (defined (@TextDefs {"fontsize"})) { $fontsize = @TextDefs {"fontsize"} ; } + if (defined (@TextDefs {"lineheight"})) { $lineheight = @TextDefs {"lineheight"} ; } + if (defined (@TextDefs {"textcolor"})) { $textcolor = @TextDefs {"textcolor"} ; } + + my $data2 = $data ; + ($data2, $text) = &ExtractText ($data2) ; + @Attributes = split (" ", $data2) ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + if ($attribute =~ /^FontSize$/i) + { + if (($attrvalue !~ /\d+(?:\.\d)?/) && ($attrvalue !~ /^(?:xs|s|m|l|xl)$/i)) + { &Error ("TextData invalid. Attribute '$attribute': specify number of XS,S,M,L,XL.") ; + &GetData ; next TextData ; } + + $fontsize = $attrvalue ; + + if ($fontsize !~ /^(?:xs|s|m|l|xl)$/i) + { + if ($fontsize < 6) + { &Warning ("TextData attribute 'fontsize' value too low. Font size 6 assumed.\n") ; + $fontsize = 6 ; } + if ($fontsize > 30) + { &Warning ("TextData attribute 'fontsize' value too high. Font size 30 assumed.\n") ; + $fontsize = 30 ; } + } + } + elsif ($attribute =~ /^LineHeight$/i) + { + $lineheight = &Normalize ($attrvalue) ; + if (($lineheight < -0.4) || ($lineheight > 0.4)) + { + if (! $bypass) + { &Error ("TextData attribute 'lineheight' invalid.\n" . + "Specify value up to 40 pixels = 0.4 inch\n" . + "Run with option -b (bypass checks) when this is correct.\n") ; } + } + } + elsif ($attribute =~ /^Pos$/i) + { + $attrvalue =~ s/\s*$hBrO (.*) $hBrC\s*/$1/x ; + ($posx,$posy) = split (",", $attrvalue) ; + $posx = &Normalize ($posx) ; + $posy = &Normalize ($posy) ; + $pos = "$posx,$posy" ; + } + elsif ($attribute =~ /^Tabs$/i) + { + $tabs = $attrvalue ; + } + elsif ($attribute =~ /^(?:Color|TextColor)$/i) + { + if (! 
&ColorPredefined ($attrvalue)) + { + if (! defined (@Colors {lc ($attrvalue)})) + { &Error ("TextData invalid. Attribute '$attribute' contains unknown color '$attrvalue'.\n" . + " Specify command 'Color' before this command.") ; + &GetData ; next TextData ; } + } + if (defined (@Colors {lc ($attrvalue)})) + { $textcolor = @Colors { lc ($attrvalue) } ; } + else + { $textcolor = lc ($attrvalue) ; } + } + elsif ($attribute =~ /^Text$/i) + { + $text = $attrvalue ; + $text =~ s/\\n/~/gs ; + if ($text =~ /\~/) + { &Warning ("TextData attribute 'text' contains ~ (tilde).\n" . + "Tilde will not be translated into newline character (only in PlotData)") ; } + + } + elsif ($attribute =~ /^Link$/i) + { + $link = &ParseText ($attrvalue) ; + $link = &EncodeURL (&NormalizeURL ($link)) ; + } + } + + if ($fontsize eq "") + { $fontsize = "S" ; } + + if ($lineheight eq "") + { + if ($fontsize =~ /^(?:XS|S|M|L|XL)$/i) + { + if ($fontsize =~ /XS/i) { $lineheight = 0.11 ; } + elsif ($fontsize =~ /S/i) { $lineheight = 0.13 ; } + elsif ($fontsize =~ /M/i) { $lineheight = 0.155 ; } + elsif ($fontsize =~ /XL/i) { $lineheight = 0.24 ; } + else { $lineheight = 0.19 ; } + } + else + { + $lineheight = sprintf ("%.2f", (($fontsize * 1.2) / 100)) ; + if ($lineheight < $fontsize/100 + 0.02) + { $lineheight = $fontsize/100 + 0.02 ; } + } + } + + if ($textcolor eq "") + { $textcolor = "black" ; } + + if ($pos eq "") + { + $pos = @TextDefs {"pos"} ; + ($posx,$posy) = split (",", $pos) ; + $posy -= $lineheight ; + if ($posy < 0) + { $posy = 0 ; } + $pos = "$posx,$posy" ; + @TextDefs {"pos"} = $pos ; + } + +# if ($link ne "") +# { ($text, $link, $hint) = &ProcessWikiLink ($text, $link, $hint) ; } + + if ($text eq "") # upd defaults + { + if ($pos ne "") { @TextDefs {"pos"} = $pos ; } + if ($tabs ne "") { @TextDefs {"tabs"} = $tabs ; } + if ($fontsize ne "") { @TextDefs {"fontsize"} = $fontsize ; } + if ($textcolor ne "") { @TextDefs {"textcolor"} = $textcolor ; } + if ($lineheight ne "") { @TextDefs 
{"lineheight"} = $lineheight ; } + &GetData ; next TextData ; + } + + if ($link ne "") + { + if ($text =~ /\[.*\]/) + { + &Warning ("TextData contains implicit link(s) in attribute 'text' and explicit attribute 'link'.\n" . + "Implicit link(s) ignored.") ; + $text =~ s/\[+ (?:[^\|]* \|)? ([^\]]*) \]+/$1/gx ; + } + + if ($hint eq "") + { $hint = &ExternalLinkToHint ($link) ; } + } + + if ($text =~ /\[ [^\]]* \^ [^\]]* \]/x) + { + &Warning ("TextData attribute 'text' contains tab character (^) inside implicit link ([[..]]). Tab ignored.") ; + $text =~ s/(\[+ [^\]]* \^ [^\]]* \]+)/($a = $1), ($a =~ s+\^+ +g), $a/gxe ; + } + + if (defined ($tabs) && ($tabs ne "")) + { + $tabs =~ s/^\s*$hBrO (.*) $hBrC\s*$/$1/x ; + @Tabs = split (",", $tabs) ; + foreach $tab (@Tabs) + { + $tab =~ s/\s* (.*) \s*$/$1/x ; + if (! ($tab =~ /\d+\-(?:center|left|right)$/)) + { &Error ("Specify attribute 'tabs' as 'n-a,n-a,n-a,.. where n = numeric value, a = left|right|center.") ; + while ((! $InputParsed) && (! $NoData)) { &GetData ; } return ; } + } + + @Text = split ('\^', $text) ; + if ($#Text > $#Tabs + 1) + { &Error ("TextData invalid. " . $#Text . " tab characters ('^') in text, only " . ($#Tabs+1) . " tab(s) defined.") ; + &GetData ; next TextData ; } + } + + &WriteText ("^", "", 0, $posx, $posy, $text, $textcolor, $fontsize, "left", $link, $hint, $tabs) ; + + &GetData ; + } +} + +sub ParseTimeAxis +{ + if (! &ValidAttributes ("TimeAxis")) { return ; } + + &CheckPreset ("TimeAxis") ; + + foreach $attribute (keys %Attributes) + { + my $attrvalue = @Attributes {$attribute} ; + + + if ($attribute =~ /Format/i) + { + if ($attrvalue =~ /^yy$/i) + { &Error ("TimeAxis attribute '$attribute' valid but not available, waiting for bug fix.\n" . + "Please specify 'format:yyyy' instead of 'format:yy'.") ; return ; } + + if ($DateFormat eq "yyyy") + { + if (! ($attrvalue =~ /^(?:yy|yyyy)$/i)) + { &Error ("TimeAxis attribute '$attribute' invalid.\n" . 
+ "DateFormat 'yyyy' implies 'format:yy' or 'format:yyyy'.") ; return ; } + } + } + + elsif ($attribute =~ /Order/i) + { + if ($attrvalue !~ /^(?:normal|reverse)$/i) + { &Error ("TimeAxis attribute '$attribute' invalid.\n" . + " Specify 'order:normal' (default) or 'order:reverse'\n" . + " normal =\n" . + " vertical axis: highest date on top,\n" . + " horizontal axis: highest date at right side\n" ) ; return ; } + + if (($attrvalue =~ /reverse/i) && ($DateFormat ne "yyyy")) + { &Error ("TimeAxis attribute '$attribute' invalid.\n" . + " 'order:reverse' is only possible with DateFormat=yyyy (sorry)\n") ; return ; } + + @Attributes {"order"} = lc ($attrvalue) ; + } + + elsif ($attribute =~ /Orientation/i) + { + if ($attrvalue =~ /^hor(?:izontal)?$/i) + { @Attributes {"time"} = "x" ; } + elsif ($attrvalue =~ /^ver(?:tical)?$/i) + { @Attributes {"time"} = "y" ; } + else + { &Error ("TimeAxis attribute '$attribute' invalid.\n" . + "Specify hor[izontal] or ver[tical]") ; return ; } + delete (@Attributes {"orientation"}) ; + } + } + + if (! defined (@Attributes {"format"})) + { @Attributes {"format"} = "yyyy" ; } + + %Axis = %Attributes ; +} + +sub ParseUnknownCommand +{ + $name = $Command ; + $name =~ s/[^a-zA-Z].*$// ; + &Error ("Command '$name' unknown.") ; +} + +sub RemoveSpaces +{ + my $text = shift ; + $text =~ s/\s//g ; + return ($text) ; +} + +sub DetectMissingCommands +{ + if (! defined (%Image)) { &Error2 ("Command ImageSize missing or invalid") ; } + if (! defined (%PlotArea)) { &Error2 ("Command PlotArea missing or invalid") ; } + if (! defined ($DateFormat)) { &Error2 ("Command DateFormat missing or invalid") ; } + if (! 
defined (@Axis {"time"})) { &Error2 ("Command TimeAxis missing or invalid") ; } + + if ((@Image {"width"} =~ /auto/i) && (@Axis {"time"} =~ /x/i)) + { &Error2 ("ImageSize value 'width:auto' only allowed with TimeAxis value 'orientation:vertical'") ; } + if ((@Image {"height"} =~ /auto/i) && (@Axis {"time"} =~ /y/i)) + { &Error2 ("ImageSize value 'height:auto' only allowed with TimeAxis value 'orientation:horizontal'") ; } +} + +sub Normalize +{ + my $number = shift ; + my $reference = shift ; + my ($val, $dim) ; + + if (($number eq "") || ($number =~ /auto/i)) + { return ($number) ; } + + $val = $number ; $val =~ s/[^\d\.\-].*$//g ; + $dim = $number ; $dim =~ s/\d//g ; + if ($dim =~ /in/i) { $number = $val ; } + elsif ($dim =~ /cm/i) { $number = $val / 2.54 ; } + elsif ($dim =~ /%/) { $number = $reference * $val / 100 ; } + else { $number = $val / 100 ; } + return (sprintf ("%.3f", $number)) ; +} + +sub ValidateAndNormalizeDimensions +{ + my ($val, $dim) ; + + if (@Image {"width"} =~ /auto/i) + { + foreach $attribute ("width","left","right") + { if (@PlotArea {$attribute} =~ /\%/) + { &Error2 ("You specified 'ImageSize = width:auto'.\n" . + " This implies absolute values in PlotArea attributes 'left', 'right' and/or 'width' (no \%).\n") ; return ; } + } + + if ((@PlotArea {"width"} ne "") || (@PlotArea {"left"} eq "") || (@PlotArea {"right"} eq "")) + { &Error2 ("You specified 'ImageSize = width:auto'.\n" . + " This implies 'PlotArea = width:auto'.\n" . + " Instead of 'width' specify plot margins with PlotArea attributes 'left' and 'right'.\n") ; return ; } + } + + + if (@Image {"height"} =~ /auto/i) + { + foreach $attribute ("height","top","bottom") + { if (@PlotArea {$attribute} =~ /\%/) + { &Error2 ("You specified 'ImageSize = height:auto'.\n" . 
+ " This implies absolute values in PlotArea attributes 'top', 'bottom' and/or 'height' (no \%).\n") ; return ; } + } + + if ((@PlotArea {"height"} ne "") || (@PlotArea {"top"} eq "") || (@PlotArea {"bottom"} eq "")) + { &Error2 ("You specified 'ImageSize = height:auto'.\n" . + " This implies 'PlotArea = height:auto'.\n" . + " Instead of 'height' specify plot margins with PlotArea attributes 'top' and 'bottom'.\n") ; return ; } + } + + @Image {"width"} = &Normalize (@Image {"width"}) ; + @Image {"height"} = &Normalize (@Image {"height"}) ; + @Image {"barinc"} = &Normalize (@Image {"barinc"}) ; + @PlotArea {"width"} = &Normalize (@PlotArea {"width"}, @Image {"width"}) ; + @PlotArea {"height"} = &Normalize (@PlotArea {"height"}, @Image {"height"}) ; + @PlotArea {"left"} = &Normalize (@PlotArea {"left"}, @Image {"width"}) ; + @PlotArea {"right"} = &Normalize (@PlotArea {"right"}, @Image {"width"}) ; + @PlotArea {"bottom"} = &Normalize (@PlotArea {"bottom"}, @Image {"height"}) ; + @PlotArea {"top"} = &Normalize (@PlotArea {"top"}, @Image {"height"}) ; + + if (@Image {"width"} =~ /auto/i) + { + @PlotArea {"width"} = $#Bars * @Image {"barinc"} ; + @Image {"width"} = @PlotArea {"left"} + @PlotArea {"width"} + @PlotArea {"right"} ; + } + + elsif (@Image {"height"} =~ /auto/i) + { + @PlotArea {"height"} = $#Bars * @Image {"barinc"} ; + @Image {"height"} = @PlotArea {"top"} + @PlotArea {"height"} + @PlotArea {"bottom"} ; + } + + if (@PlotArea {"right"} ne "") + { @PlotArea {"width"} = @Image {"width"} - @PlotArea {"left"} - @PlotArea {"right"} ; } + + if (@PlotArea {"top"} ne "") + { @PlotArea {"height"} = @Image {"height"} - @PlotArea {"top"} - @PlotArea {"bottom"} ; } + + if ((@Image {"width"} > 16) || (@Image {"height"} > 20)) + { + if (! $bypass) + { &Error2 ("Maximum image size is 1600x2000 pixels = 16x20 inch\n" . 
+ " Run with option -b (bypass checks) when this is correct.\n") ; return ; } + } + + if ((@Image {"width"} < 0.25) || (@Image {"height"} < 0.25)) + { + &Error2 ("Minimum image size is 25x25 pixels = 0.25x0.25 inch\n") ; + return ; + } + + if (@PlotArea {"width"} > @Image {"width"}) + { &Error2 ("Plot width larger than image width. Please adjust.\n") ; return ; } + + if (@PlotArea {"width"} < 0.2) + { &Error2 ("Plot width less than 20 pixels = 0.2 inch. Please adjust.\n") ; return ; } + + if (@PlotArea {"height"} > @Image {"height"}) + { &Error2 ("Plot height larger than image height. Please adjust.\n") ; return ; } + + if (@PlotArea {"height"} < 0.2) + { &Error2 ("Plot height less than 20 pixels = 0.2 inch. Please adjust.\n") ; return ; } + + if (@PlotArea {"left"} + @PlotArea {"width"} > @Image {"width"}) + { &Error2 ("Plot width + margins larger than image width. Please adjust.\n") ; return ; } +# @PlotArea {"left"} = @Image {"width"} - @PlotArea {"width"} ; } + + if (@PlotArea {"left"} < 0) + { @PlotArea {"left"} = 0 ; } + + if (@PlotArea {"bottom"} + @PlotArea {"height"} > @Image {"height"}) + { &Error2 ("Plot height + margins larger than image height. Please adjust.\n") ; return ; } +# @PlotArea {"bottom"} = @Image {"height"} - @PlotArea {"height"} ; } + + if (@PlotArea {"bottom"} < 0) + { @PlotArea {"bottom"} = 0 ; } + + if ((defined (@Scales {"Major"})) || + (defined (@Scales {"Minor"}))) + { + if (defined (@Scales {"Major"})) + { $margin = 0.2 ; } + else + { $margin = 0.05 ; } + + if (@Axis {"time"} eq "x") + { + if (@PlotArea {"bottom"} < $margin) + { &Error2 ("Not enough space below plot area for plotting time axis\n" . + " Specify 'PlotArea = bottom:x', where x is at least " . (100 * $margin) . " pixels = $margin inch\n") ; return ; } + } + else + { + if (@PlotArea {"left"} < $margin) + { &Error2 ("Not enough space outside plot area for plotting time axis\n" . + " Specify 'PlotArea = left:x', where x is at least " . (100 * $margin) . 
" pixels = $margin inch\n") ; return ; } + } + } + + if (defined (@Legend {"orientation"})) + { + if (defined (@Legend {"left"})) + { @Legend {"left"} = &Normalize (@Legend {"left"}, @Image {"width"}) ; } + if (defined (@Legend {"top"})) + { @Legend {"top"} = &Normalize (@Legend {"top"}, @Image {"height"}) ; } + if (defined (@Legend {"columnwidth"})) + { @Legend {"columnwidth"} = &Normalize (@Legend {"columnwidth"}, @Image {"width"}) ; } + + if (! defined (@Legend {"columns"})) + { + @Legend {"columns"} = 1 ; + if ((@Legend {"orientation"} =~ /ver/i) && + (@Legend {"position"} =~ /^(?:top|bottom)$/i)) + { + if ($#LegendData > 10) + { + @Legend {"columns"} = 3 ; + &Info2 ("Legend attribute 'columns' not defined. 3 columns assumed.") ; + } + elsif ($#LegendData > 5) + { + @Legend {"columns"} = 2 ; + &Info2 ("Legend attribute 'columns' not defined. 2 columns assumed.") ; + } + } + } + + if (@Legend {"position"} =~ /top/i) + { + if (! defined (@Legend {"left"})) + { @Legend {"left"} = @PlotArea {"left"} ; } + if (! defined (@Legend {"top"})) + { @Legend {"top"} = (@Image {"height"} - 0.2) ; } + if ((! defined (@Legend {"columnwidth"})) && (@Legend {"columns"} > 1)) + { @Legend {"columnwidth"} = sprintf ("%02f", ((@PlotArea {"left"} + @PlotArea {"width"} - 0.2) / @Legend {"columns"})) ; } + } + elsif (@Legend {"position"} =~ /bottom/i) + { + if (! defined (@Legend {"left"})) + { @Legend {"left"} = @PlotArea {"left"} ; } + if (! defined (@Legend {"top"})) + { @Legend {"top"} = (@PlotArea {"bottom"} - 0.4) ; } + if ((! defined (@Legend {"columnwidth"})) && (@Legend {"columns"} > 1)) + { @Legend {"columnwidth"} = sprintf ("%02f", ((@PlotArea {"left"} + @PlotArea {"width"} - 0.2) / @Legend {"columns"})) ; } + } + elsif (@Legend {"position"} =~ /right/i) + { + if (! defined (@Legend {"left"})) + { @Legend {"left"} = (@PlotArea {"left"} + @PlotArea {"width"} + 0.2) ; } + if (! 
defined (@Legend {"top"})) + { @Legend {"top"} = (@PlotArea {"bottom"} + @PlotArea {"height"} - 0.2) ; } + } + } + + if (! defined (@Axis {"order"})) + { @Axis {"order"} = "normal" ; } +} + +sub WriteProcAnnotate +{ + my $bar = shift ; + my $shiftx = shift ; + my $xpos = shift ; + my $ypos = shift ; + my $text = shift ; + my $textcolor = shift ; + my $fontsize = shift ; + my $align = shift ; + my $link = shift ; + my $hint = shift ; + + if (length ($text) > 250) + { &Error ("Text segments can be up to 250 characters long. This segment is " . length ($text) . " chars.\n" . + " You can either shorten the text or\n" . + " - PlotData: insert line breaks (~)\n" . + " - TextData: insert tabs (~) to produce columns\n") ; return ; } + + if ($textcolor eq "") + { $textcolor = "black" ; } + + my $textdetails = " textdetails: align=$align size=$fontsize color=$textcolor" ; + + push @PlotTextsPng, "#proc annotate\n" ; + push @PlotTextsSvg, "#proc annotate\n" ; + + push @PlotTextsPng, " location: $xpos $ypos\n" ; + push @PlotTextsSvg, " location: $xpos $ypos\n" ; + + push @PlotTextsPng, $textdetails . "\n" ; + push @PlotTextsSvg, $textdetails . 
"\n" ; + + $text2 = $text ; + $text2 =~ s/\[\[//g ; + $text2 =~ s/\]\]//g ; + if ($text2 =~ /^\s/) + { push @PlotTextsPng, " text: \n\\$text2\n\n" ; } + else + { push @PlotTextsPng, " text: $text2\n\n" ; } + + $text2 = $text ; + if ($link ne "") + { + # put placeholder in Ploticus input file + # will be replaced by real link after SVG generation + # this allows adding color info + push @linksSVG, &DecodeInput ($link) ; + my $lcnt = $#linksSVG ; + $text2 =~ s/\[\[ ([^\]]+) \]\]/\[$lcnt\[$1\]$lcnt\]/x ; + $text2 =~ s/\[\[ ([^\]]+) $/\[$lcnt\[$1\]$lcnt\]/x ; + $text2 =~ s/^ ([^\[]+) \]\]/\[$lcnt\[$1\]$lcnt\]/x ; + } + + $text3 = &EncodeHtml ($text2) ; + if ($text2 ne $text3) + { + # put placeholder in Ploticus input file + # will be replaced by real text after SVG generation + # Ploticus would autoscale image improperly when text contains &#xxx; tags + # because this would count as 5 chars + push @textsSVG, &DecodeInput ($text3) ; + $text3 = "{{" . $#textsSVG . "}}" ; + while (length ($text3) < length ($text2)) { $text3 .= "x" ; } + } + + if ($text3 =~ /^\s/) + { push @PlotTextsSvg, " text: \n\\$text3\n\n" ; } + else + { push @PlotTextsSvg, " text: $text3\n\n" ; } + + if ($link ne "") + { + $MapPNG = $true ; + + push @PlotTextsPng, "#proc annotate\n" ; + push @PlotTextsPng, " location: $xpos $ypos\n" ; + +# push @PlotTextsPng, " boxmargin: 0.01\n" ; + + if ($align ne "right") + { + push @PlotTextsPng, " clickmapurl: $link\n" ; + if ($hint ne "") + { push @PlotTextsPng, " clickmaplabel: $hint\n" ; } + } + else + { + if ($bar eq "") + { + if ($WarnOnRightAlignedText ++ == 0) + { &Warning2 ("Links on right aligned texts are only supported for svg output,\npending Ploticus bug fix.") ; } + return ; + } + else + { + push @PlotTextsPng, " clickmapurl: $link\&\&$shiftx\n" ; + if ($hint ne "") + { push @PlotTextsPng, " clickmaplabel: $hint\n" ; } + } + } + + $textdetails =~ s/color=[^\s]+/color=$LinkColor/ ; + push @PlotTextsPng, $textdetails . 
"\n" ; + + $text = &DecodeInput ($text) ; + if ($text =~ /^[^\[]+\]\]/) + { $text = "[[" . $text ; } + if ($text =~ /\[\[[^\]]+$/) + { $text .= "]]" ; } + my $pos1 = index ($text, "[[") ; + my $pos2 = index ($text, "]]") + 1 ; + if (($pos1 > -1) && ($pos2 > -1)) + { + for (my $i = 0 ; $i < length ($text) ; $i++) + { + $c = substr ($text, $i, 1) ; + if ($c ne "\n") + { + if (($i < $pos1) || ($i > $pos2)) + { substr ($text, $i, 1) = " " ; } + } + } + } + + $text =~ s/\[\[(.*?)\]\]/$1/s ; + + if ($text =~ /^\s/) + { push @PlotTextsPng, " text: \n\\$text\n\n" ; } + else + { push @PlotTextsPng, " text: $text\n\n" ; } + +# push @PlotTextsPng, "#proc rect\n" ; +# push @PlotTextsPng, " color: green\n" ; +# push @PlotTextsPng, " rectangle: 1(s)+0.25 1937.500(s)+0.06 1(s)+0.50 1937.500(s)+0.058\n" ; +# push @PlotTextsPng, "\n\n" ; + } +} + +sub WriteText +{ + my $mode = shift ; + my $bar = shift ; + my $shiftx = shift ; + my $posx = shift ; + my $posy = shift ; + my $text = shift ; + my $textcolor = shift ; + my $fontsize = shift ; + my $align = shift ; + my $link = shift ; + my $hint = shift ; + my $tabs = shift ; + my ($link2, $hint2, $tab) ; + my $outside = $false ; + if (@Axis {"order"} =~ /reverse/i) + { + if (@Axis {"time"} eq "y") + { $posy =~ s/(.*)(\(s\))/(-$1).$2/xe ; } + else + { $posx =~ s/(.*)(\(s\))/(-$1).$2/xe ; } + } + + if ($posx !~ /\(s\)/) + { + if ($posx < 0) + { $outside = $true ; } + if (@Image {"width"} !~ /auto/i) + { + if ($posx > @Image {"width"}/100) + { $outside = $true ; } + } + } + if ($posy !~ /\(s\)/) + { + if ($posy < 0) + { $outside = $true ; } + if (@Image {"height"} !~ /auto/i) + { + if ($posy > @Image {"height"}/100) + { $outside = $true ; } + } + } + if ($outside) + { + if ($WarnTextOutsideArea++ < 5) + { $text =~ s/\n/~/g ; + &Error ("Text segment '$text' falls outside image area. 
Text ignored.") ; } + return ; + } + + my @Tabs = split (",", $tabs) ; + foreach $tab (@Tabs) + { $tab =~ s/\s* (.*) \s*$/$1/x ; } + + $posx0 = $posx ; + my @Text ; + my $dy = 0 ; + + if ($text =~ /\[\[.*\]\]/) + { + $link = "" ; $hint = "" ; + } + + my @Text ; + if ($mode eq "^") + { @Text = split ('\^', $text) ; } + elsif ($mode eq "~") + { + @Text = split ('\n', $text) ; + + if ($fontsize =~ /^(?:XS|S|M|L|XL)$/i) + { + if ($fontsize =~ /XS/i) { $dy = 0.09 ; } + elsif ($fontsize =~ /S/i) { $dy = 0.11 ; } + elsif ($fontsize =~ /M/i) { $dy = 0.135 ; } + elsif ($fontsize =~ /XL/i) { $dy = 0.21 ; } + else { $dy = 0.16 ; } + } + else + { + $dy = sprintf ("%.2f", (($fontsize * 1.2) / 100)) ; + if ($dy < $fontsize/100 + 0.02) + { $dy = $fontsize/100 + 0.02 ; } + } + } + else + { push @Text, $text ; } + + + foreach $text (@Text) + { + if ($text !~ /^[\n\s]*$/) + { + $link2 = "" ; + $hint2 = "" ; + ($text, $link2, $hint2) = &ProcessWikiLink ($text, $link2, $hint2) ; + + if ($link2 eq "") + { + $link2 = $link ; + if (($link ne "") && ($text !~ /\[\[.*\]\]/)) + { $text = "[[" . $text . "]]" ;} + } + if ($hint2 eq "") + { $hint2 = $hint ; } + + &WriteProcAnnotate ($bar, $shiftx, $posx, $posy, $text, $textcolor, $fontsize, $align, $link2, $hint2) ; + } + + if ($#Tabs >= 0) + { + $tab = shift (@Tabs) ; + ($dx,$align) = split ("\-", $tab) ; + $posx = $posx0 + &Normalize ($dx) ; + } + if ($posy =~ /\+/) + { ($posy1, $posy2) = split ('\+', $posy) ; } + elsif ($posy =~ /.+\-/) + { + if ($posy =~ /^\-/) + { + ($sign, $posy1, $posy2) = split ('\-', $posy) ; $posy2 = -$posy2 ; + $posy1 = "-" . $posy1 ; + } + else + { ($posy1, $posy2) = split ('\-', $posy) ; $posy2 = -$posy2 ; } + } + else + { $posy1 = $posy ; $posy2 = 0 ; } + + $posy2 -= $dy ; + + if ($posy2 == 0) + { $posy = $posy1 ; } + elsif ($posy2 < 0) + { $posy = $posy1 . "$posy2" ; } + else + { $posy = $posy1 . "+" . 
$posy2 ; } + } +} + +sub WriteProcDrawCommandsOld +{ + my $posx = shift ; + my $posy = shift ; + my $text = shift ; + my $textcolor = shift ; + my $fontsize = shift ; + my $link = shift ; + my $hint = shift ; + + $posx0 = $posx ; + my @Text = split ('\^', $text) ; + my $align = "text" ; + foreach $text (@Text) + { + push @TextData, " mov $posx $posy\n" ; + push @TextData, " textsize $fontsize\n" ; + push @TextData, " color $textcolor\n" ; + push @TextData, " $align $text\n" ; + + + $tab = shift (@Tabs) ; + ($dx,$align) = split ("\-", $tab) ; + $posx = $posx0 + &Normalize ($dx) ; + if ($align =~ /left/i) { $align = "text" ; } + elsif ($align =~ /right/i) { $align = "rightjust" ; } + else { $align = "centext" ; } + } +} + +sub WritePlotFile +{ + &WriteTexts ; + + $script = "" ; + my ($color) ; + if (@Axis {"time"} eq "x") + { $AxisBars = "y" ; } + else + { $AxisBars = "x" ; } + +# if ((@Axis {"time"} eq "y") && ($#Bars > 0)) +# { +# undef @BarsTmp ; +# while ($#Bars >= 0) +# { push @BarsTmp, pop @Bars ; } +# @Bars = @BarsTmp ; +# } + + if ($tmpdir ne "") + { $file_script = $tmpdir.$pathseparator."EasyTimeline.txt.$$" ; } + else + { $file_script = "EasyTimeline.txt" ; } + + print "Ploticus input file = ".$file_script."\n"; + + # $fmt = "gif" ; + open "FILE_OUT", ">", $file_script ; + + #proc settings +# $script .= "#proc settings\n" ; +# $script .= " xml_encoding: utf-8\n" ; +# $script .= "\n" ; + + # proc page + $script .= "#proc page\n" ; + $script .= " dopagebox: no\n" ; + $script .= " pagesize: ". @Image {"width"} . " ". @Image {"height"} . "\n" ; + if (defined (@BackgroundColors {"canvas"})) + { $script .= " backgroundcolor: " . @BackgroundColors {"canvas"} . 
"\n" ; } + $script .= "\n" ; + + $barcnt = $#Bars + 1 ; + +# if ($AlignBars eq "justify") && ($#Bars > 0) +# +# given P = plotwidth in pixels +# given B = half bar width in pixels +# get U = plotwidth in units +# get x = half bar width in units +# +# first bar plotted at unit 1 +# last bar plotted at unit c +# let C = c - 1 (units between centers of lowest and highest bar) -> x = (U-C) / 2 +# +# Justify: calculate range for axis in units: +# axis starts at 1-x and ends at c+x = +# x/B = U/P -> x = BU/P (1) +# U = c+x - (1-x) = (c-1) + 2x -> x = (U-(c-1))/2 (2) +# +# (1) & (2) -> BU/P = (U-(c-1))/2 +# -> 2BU/P = U-(c-1) +# -> 2BU/P = U - C +# -> 2BU = PU - PC +# -> U (2B-P) = -PC +# -> U = -PC/(2B-P) +# P = @PlotArea {$extent} +# C = c - 1 = $#Bars +# 2B = $MaxBarWidth + if (! defined ($AlignBars)) + { + &Info2 ("AlignBars not defined. Alignment 'early' assumed.") ; + $AlignBars = "early" ; + } + + if (@Axis {"time"} eq "x") + { $extent = "height" ; } + else + { $extent = "width" ; } + + if ($MaxBarWidth > @PlotArea {$extent}) + { &Error2 ("Maximum bar width exceeds plotarea " . $extent . 
".") ; return ; } + + if ($MaxBarWidth == @PlotArea {$extent}) + { @PlotArea {$extent} += 0.01 ; } + + if ($MaxBarWidth == @PlotArea {$extent}) + { + $till = 1 ; + $from = 1 ; + } + else + { + if ($AlignBars eq "justify") + { + if ($#Bars > 0) + { + $U = - (@PlotArea {$extent} * $#Bars) / ($MaxBarWidth - @PlotArea {$extent}) ; + $x = ($U - $#Bars) / 2 ; + $from = 1 - $x ; + $till = 1 + $#Bars + $x ; + } + else # one bar-> "justify" is misnomer here, treat as "center" + { + # $x = ($MaxBarWidth /2) / @PlotArea {$extent} ; + # $from = 0.5 - $x ; + # $till = $from + 1 ; + $from = 0.5 ; + $till = 1.5 ; + } + } + elsif ($AlignBars eq "early") + { + $U = $#Bars + 1 ; + if ($U == 0) + { $U = 1 ; } + $x = (($MaxBarWidth /2) * $U) / @PlotArea {$extent} ; + $from = 1 - $x ; + $till = $from + $U ; + } + elsif ($AlignBars eq "late") + { + $U = $#Bars + 1 ; + $x = (($MaxBarWidth /2) * $U) / @PlotArea {$extent} ; + $till = $U + $x ; + $from = $till - $U ; + } + } + +# if ($#Bars == 0) +# { +# $from = 1 - $MaxBarWidth ; +# $till = 1 + $MaxBarWidth ; +# } + if ($from eq $till) + { $till = $from + 1 ; } + + #proc areadef + $script .= "#proc areadef\n" ; + $script .= " rectangle: " . @PlotArea {"left"} . " " . @PlotArea {"bottom"} . " " . + sprintf ("%.2f", @PlotArea {"left"} + @PlotArea {"width"}). " " . sprintf ("%.2f", @PlotArea {"bottom"} + @PlotArea {"height"}) . "\n" ; + if (($DateFormat eq "yyyy") || ($DateFormat eq "x.y")) + { $script .= " " . @Axis {"time"} . "scaletype: linear\n" ; } # date yyyy + else + { $script .= " " . @Axis {"time"} . "scaletype: date $DateFormat\n" ; } + + if (@Axis {"order"} !~ /reverse/i) + { $script .= " " . @Axis {"time"} . "range: " . @Period{"from"} . " " . @Period{"till"} . "\n" ; } + else + { $script .= " " . @Axis {"time"} . "range: " . (-@Period{"till"}) . " " . (-@Period{"from"}) . "\n" ; } + + $script .= " " . $AxisBars . "scaletype: linear\n" ; + $script .= " " . $AxisBars . "range: " . sprintf ("%.3f", $from-0.001) . " " . 
sprintf ("%.3f", $till) . "\n" ; + $script .= " #saveas: A\n" ; + $script .= "\n" ; + + #proc rect (test) +# $script .= "#proc rect\n" ; +# $script .= " rectangle 1.0 1.0 1.4 1.4\n" ; +# $script .= " color gray(0.95)\n" ; +# $script .= " clickmaplabel: Vladimir Ilyich Lenin\n" ; +# $script .= " clickmapurl: http://www.wikipedia.org/wiki/Vladimir_Lenin\n" ; + + + #proc legendentry + foreach $color (sort keys %Colors) + { + $script .= "#proc legendentry\n" ; + $script .= " sampletype: color\n" ; + + if ((defined (@ColorLabels {$color})) && (@ColorLabels {$color} ne "")) + { $script .= " label: " . @ColorLabels {$color} . "\n" ; } + $script .= " details: " . @Colors {$color} . "\n" ; + $script .= " tag: $color\n" ; + $script .= "\n" ; + } + + if (defined (@BackgroundColors {"bars"})) + { + #proc getdata / #proc bars + $script .= "#proc getdata\n" ; + $script .= " delim: comma\n" ; + $script .= " data:\n" ; + + $maxwidth = 0 ; + foreach $entry (@PlotBars) + { + ($width) = split (",", $entry) ; + if ($width > $maxwidth) + { $maxwidth = $width ; } + } + + for ($b = 0 ; $b <= $#Bars ; $b++) + { $script .= ($b+1) . "," . @Period {"from"} . "," . @Period {"till"} . ",". + @BackgroundColors {"bars"} . "\n" ; } + $script .= "\n" ; + + #proc bars + $script .= "#proc bars\n" ; + $script .= " axis: " . @Axis {"time"} . 
"\n" ; + $script .= " barwidth: $maxwidth\n" ; + $script .= " outline: no\n" ; + if (@Axis {"time"} eq "x") + { $script .= " horizontalbars: yes\n" ; } + $script .= " locfield: 1\n" ; + $script .= " segmentfields: 2 3\n" ; + $script .= " colorfield: 4\n" ; + +# $script .= " clickmaplabel: Vladimir Ilyich Lenin\n" ; +# $script .= " clickmapurl: http://www.wikipedia.org/wiki/Vladimir_Lenin\n" ; + + $script .= "\n" ; + } + + #proc axis + if (defined (@Scales {"Minor grid"})) + { &PlotScale ("Minor", $true) ; } + if (defined (@Scales {"Major grid"})) + { &PlotScale ("Major", $true) ; } + + &PlotLines ("back") ; + + @PlotBarsNow = @PlotBars ; + &PlotBars ; + + $script .= "\n([inc3])\n\n" ; # will be replace by rects + +%x = %BarWidths ; + foreach $entry (@PlotLines) + { + ($bar) = split (",", $entry) ; + $bar =~ s/\#.*// ; + $width = @BarWidths {$bar} ; + $entry = sprintf ("%6.3f",$width) . "," . $entry ; + } + + @PlotBarsNow = @PlotLines ; + &PlotBars ; + + #proc axis + if ($#Bars > 0) + { + $scriptPng2 = "#proc " . $AxisBars . "axis\n" ; + $scriptSvg2 = "#proc " . $AxisBars . "axis\n" ; + if ($AxisBars eq "x") + { + $scriptPng2 .= " stubdetails: adjust=0,0.09\n" ; + $scriptSvg2 .= " stubdetails: adjust=0,0.09\n" ; + } + else + { + $scriptPng2 .= " stubdetails: adjust=0.09,0\n" ; + $scriptSvg2 .= " stubdetails: adjust=0.09,0\n" ; + } + $scriptPng2 .= " tics: none\n" ; + $scriptSvg2 .= " tics: none\n" ; + $scriptPng2 .= " stubrange: 1\n" ; + $scriptSvg2 .= " stubrange: 1\n" ; + if ($AxisBars eq "y") + { + $scriptPng2 .= " stubslide: -" . sprintf ("%.2f", $MaxBarWidth / 2) . "\n" ; + $scriptSvg2 .= " stubslide: -" . sprintf ("%.2f", $MaxBarWidth / 2) . 
"\n" ; + } + $scriptPng2 .= " stubs: text\n" ; + $scriptSvg2 .= " stubs: text\n" ; + + my ($text, $link, $hint) ; + + undef (@Bars2) ; + foreach $bar (@Bars) + { + if ($AxisBars eq "y") + { push @Bars2, $bar ; } + else + { unshift @Bars2, $bar ; } + } + + foreach $bar (@Bars2) + { + $hint = "" ; + $text = @BarLegend {lc ($bar)} ; + if ($text =~ /^\s*$/) + { $text = "\\" ; } + + $link = @BarLink {lc ($bar)} ; + if (! defined ($link)) + { + if ($text =~ /\[.*\]/) + { ($text, $link, $hint) = &ProcessWikiLink ($text, $link, $hint) ; } + } + + $text =~ s/\[+([^\]]*)\]+/$1/ ; + $scriptPng2 .= "$text\n" ; + if (defined ($link)) + { + push @linksSVG, $link ; + my $lcnt = $#linksSVG ; + $scriptSvg2 .= "[" . $lcnt . "[" . $text . "]" . $lcnt . "]\n" ; + } + else + { $scriptSvg2 .= "$text\n" ; } + } + $scriptPng2 .= "\n" ; + $scriptSvg2 .= "\n" ; + + $scriptPng2 .= "#proc " . $AxisBars . "axis\n" ; + if ($AxisBars eq "x") + { $scriptPng2 .= " stubdetails: adjust=0,0.09 color=$LinkColor\n" ; } + else + { $scriptPng2 .= " stubdetails: adjust=0.09,0 color=$LinkColor\n" ; } + $scriptPng2 .= " tics: none\n" ; + $scriptPng2 .= " stubrange: 1\n" ; + if ($AxisBars eq "y") + { $scriptPng2 .= " stubslide: -" . sprintf ("%.2f", $MaxBarWidth / 2) . "\n" ; } + $scriptPng2 .= " stubs: text\n" ; + + $barcnt = $#Bars + 1 ; + foreach $bar (@Bars2) + { + $hint = "" ; + $text = @BarLegend {lc ($bar)} ; + if ($text =~ /^\s*$/) + { $text = "\\" ; } + + $link = @BarLink {lc ($bar)} ; + if (! defined ($link)) + { + if ($text =~ /\[.*\]/) + { ($text, $link, $hint) = &ProcessWikiLink ($text, $link, $hint) ; } + } + if ((! defined ($link)) || ($link eq "")) + { $text = "\\" ; } + else + { + $scriptPng3 .= "#proc rect\n" ; + $scriptPng3 .= " rectangle: 0 $barcnt(s)+0.05 " . @PlotArea {"left"} . " $barcnt(s)-0.05\n" ; + $scriptPng3 .= " color: " . @BackgroundColors {"canvas"} . "\n" ; + $scriptPng3 .= " clickmapurl: " . $link . 
"\n" ; + if ((defined ($hint)) && ($hint ne "")) + { $scriptPng3 .= " clickmaplabel: " . $hint . "\n" ; } + + $text =~ s/\[+([^\]]*)\]+/$1/ ; + } + $scriptPng2 .= "$text\n" ; + + $barcnt-- ; + } + $scriptPng2 .= "\n" ; + } + + &PlotLines ("front") ; + + $script .= "\n([inc1])\n\n" ; # will be replaced by annotations + $script .= "\n([inc2])\n\n" ; + + + if ($#PlotTextsPng >= 0) + { + foreach $command (@PlotTextsPng) + { + if ($command =~ /^\s*location/) + { $command =~ s/(.*)\[(.*)\](.*)/$1 . ($#Bars - $2 + 2) . $3/xe ; } + + $scriptPng1 .= $command ; + } + $scriptPng1 .= "\n" ; + } + + if ($#PlotTextsSvg >= 0) + { + foreach $command (@PlotTextsSvg) + { + if ($command =~ /^\s*location/) + { $command =~ s/(.*)\[(.*)\](.*)/$1 . ($#Bars - $2 + 2) . $3/xe ; } + + $scriptSvg1 .= $command ; + } + $scriptSvg1 .= "\n" ; + } + +# $script .= "#proc symbol\n" ; +# $script .= " location: 01/01/1943(s) Korea \n" ; +# $script .= " symbol: style=fill shape=downtriangle fillcolor=white radius=0.04\n" ; +# $script .= "\n" ; + + #proc axis + # repeat without grid to get axis on top of bar + # needed because axis may overlap bar slightly + if (defined (@Scales {"Minor"})) + { &PlotScale ("Minor", $false) ; } + if (defined (@Scales {"Major"})) + { &PlotScale ("Major", $false) ; } + + #proc drawcommands + if ($#TextData >= 0) + { + $script .= "#proc drawcommands\n" ; + $script .= " commands:\n" ; + foreach $entry (@TextData) + { $script .= $entry ; } + $script .= "\n" ; + } + + #proc legend + if (defined (@Legend {"orientation"})) + { + if (($#LegendData < 0) && ($Preset eq "")) + { &Error2 ("Command 'Legend' found, but no entries for the legend were specified.\n" . + " Please remove or disable command (disable = put \# before the command)\n" . 
+ " or specify entries for the legend with command 'Colors', attribute 'legend'\n") ; + return ; } + + $perColumn = 999 ; + if (@Legend {"orientation"} =~ /ver/i) + { + if (@Legend {"columns"} > 1) + { + $perColumn = 0 ; + while ((@Legend {"columns"} * $perColumn) < $#LegendData + 1) + { $perColumn ++ ; } + } + } + + for ($l = 1 ; $l <= @Legend {"columns"} ; $l++) + { + $script .= "#proc legend\n" ; + $script .= " noclear: yes\n" ; + if (@Legend {"orientation"} =~ /ver/i) + { $script .= " format: multiline\n" ; } + else + { $script .= " format: singleline\n" ; } + $script .= " seglen: 0.2\n" ; + $script .= " swatchsize: 0.12\n" ; + $script .= " textdetails: size=S\n" ; + $script .= " location: " . (@Legend{"left"}+0.2) . " " . @Legend{"top"} . "\n" ; + $script .= " specifyorder:\n" ; + for ($l2 = 1 ; $l2 <= $perColumn ; $l2++) + { + $category = shift (@LegendData) ; + if (defined ($category)) + { $script .= "$category\n" ; } + } + $script .= "\n" ; + @Legend {"left"} += @Legend {"columnwidth"} ; + } + } + + $script .= "#endproc\n" ; + + print "\nGenerating output:\n" ; + if ( $plcommand ne "" ) + { $pl = $plcommand; } + else + { + $pl = "pl.exe" ; + if ($env eq "Linux") + { $pl = "pl" ; } + } + + print "Using ploticus command \"".$pl."\" (".$plcommand.")\n"; + + $script_save = $script ; + + $script =~ s/\(\[inc1\]\)/$scriptSvg1/ ; + $script =~ s/\(\[inc2\]\)/$scriptSvg2/ ; + $script =~ s/\(\[inc3\]\)// ; + + $script =~ s/textsize XS/textsize 7/gi ; + $script =~ s/textsize S/textsize 8.9/gi ; + + $script =~ s/textsize M/textsize 10.5/gi ; + $script =~ s/textsize L/textsize 13/gi ; + $script =~ s/textsize XL/textsize 17/gi ; + $script =~ s/size=XS/size=7/gi ; + $script =~ s/size=S/size=8.9/gi ; + $script =~ s/size=M/size=10.5/gi ; + $script =~ s/size=L/size=13/gi ; + $script =~ s/size=XL/size=17/gi ; + + + $script =~ s/(\n location:.*)/&ShiftOnePixelForSVG($1)/ge ; + + open "FILE_OUT", ">", $file_script ; + print FILE_OUT &DecodeInput($script) ; + close "FILE_OUT" ; 
+ + $map = ($MapSVG) ? "-map" : ""; + + print "Running Ploticus to generate svg file\n" ; +# my $cmd = "$pl $map -" . "svg" . " -o $file_vector $file_script -tightcrop -font \"Times\"" ; +# my $cmd = "$pl $map -" . "svg" . " -o $file_vector $file_script -tightcrop" ; + my $cmd = EscapeShellArg($pl) . " $map -" . "svg" . " -o " . + EscapeShellArg($file_vector) . " " . EscapeShellArg($file_script) . " -tightcrop" ; + print "$cmd\n"; + system ($cmd) ; + + $script = $script_save ; + $script =~ s/dopagebox: no/dopagebox: yes/ ; + + $script =~ s/\(\[inc1\]\)/$scriptPng1/ ; + $script =~ s/\(\[inc2\]\)/$scriptPng2/ ; + $script =~ s/\(\[inc3\]\)/$scriptPng3/ ; + + $script =~ s/textsize XS/textsize 6/gi ; + $script =~ s/textsize S/textsize 8/gi ; + $script =~ s/textsize M/textsize 10/gi ; + $script =~ s/textsize L/textsize 14/gi ; + $script =~ s/textsize XL/textsize 18/gi ; + $script =~ s/size=XS/size=6/gi ; + $script =~ s/size=S/size=8/gi ; + $script =~ s/size=M/size=10/gi ; + $script =~ s/size=L/size=14/gi ; + $script =~ s/size=XL/size=18/gi ; + + open "FILE_OUT", ">", $file_script ; + print FILE_OUT &DecodeInput($script) ; + close "FILE_OUT" ; + + $map = ($MapPNG && $linkmap) ? "-csmap" : ""; + if ($linkmap && $showmap) + { $map .= " -csmapdemo" ; } + +# $crop = "-crop 0,0," + @ImageSize {"width"} . "," . @ImageSize {"height"} ; + print "Running Ploticus to generate bitmap\n" ; +# $cmd = "$pl $map -" . $fmt . " -o $file_bitmap $file_script -tightcrop" ; # -v $file_bitmap" ; +# $cmd = "$pl $map -" . $fmt . " -o $file_bitmap $file_script -tightcrop -diagfile $file_pl_info -errfile $file_pl_err" ; + $cmd = EscapeShellArg($pl) . " $map -" . $fmt . " -o " . + EscapeShellArg($file_bitmap) . " " . EscapeShellArg($file_script) . " -tightcrop" . + " -mapfile " . EscapeShellArg($file_htmlmap) ; + print "$cmd\n"; + system ($cmd) ; + + if ((-e $file_bitmap) && (-s $file_bitmap > 500 * 1024)) + { + &Error2 ("Output image size exceeds 500 K. Image deleted.\n" . 
+ "Run with option -b (bypass checks) when this is correct.\n") ; + unlink $file_bitmap ; + } ; + + # not for Wikipedia, only for offline use: + if ((-e $file_bitmap) && ($fmt eq "gif")) + { + print "Running nconvert to convert gif image to png format\n\n" ; + print "---------------------------------------------------------------------------\n" ; + $cmd = "nconvert.exe -out png " . EscapeShellArg($file_bitmap) ; + system ($cmd) ; + print "---------------------------------------------------------------------------\n" ; + + if (! (-e $file_png)) + { print "PNG file not created (is nconvert.exe missing?)\n\n" ; } + } + + if (-e $file_htmlmap) # correct click coordinates of right aligned texts (Ploticus bug) + { + open "FILE_IN", "<", $file_htmlmap ; + @map = <FILE_IN> ; + close "FILE_IN" ; + + foreach $line (@map) + { + chomp $line ; + if ($line =~ /\&\&/) + { + $coords = $line ; + $shift = $line ; + $coords =~ s/^.*coords\=\"([^\"]*)\".*$/$1/ ; + $shift =~ s/^.*\&\&([^\"]*)\".*$/$1/ ; + $line =~ s/\&\&[^\"]*// ; + (@updcoords) = split (",", $coords) ; + $maplength = @updcoords [2] - @updcoords [0] ; + @updcoords [0] = @updcoords [0] - 2 * ($maplength-25) ; + @updcoords [2] = @updcoords [0] + $maplength ; + $coordsnew = join (",", @updcoords) ; + $line =~ s/$coords/$coordsnew/ ; + push @map2, $line . "\n" ; + } + else + { push @map2, $line . "\n" ; } + } + + open "FILE_OUT", ">", $file_htmlmap ; + print FILE_OUT @map2 ; + close "FILE_OUT" ; + } + + if (-e $file_vector) + { + open "FILE_IN", "<", $file_vector ; + @svg = <FILE_IN> ; + close "FILE_IN" ; + + foreach $line (@svg) + { + $line =~ s/\{\{(\d+)\}\}x+/@textsSVG[$1]/gxe ; + $line =~ s/\[(\d+)\[ (.*?) \]\d+\]/'<a style="fill:blue;" xlink:href="' . @linksSVG[$1] . '">' . $2 . 
'<\/a>'/gxe ; + } + + open "FILE_OUT", ">", $file_vector ; + print FILE_OUT @svg ; + close "FILE_OUT" ; + } + + # not for Wikipedia, for offline use: + if ($makehtml) + { + $map = "" ; + if ($linkmap) + { + open "FILE_IN", "<", $file_htmlmap ; + while ($line = <FILE_IN>) + { $map .= $line ; } + close "FILE_IN" ; + } + print "Generating html test file\n" ; + $width = sprintf ("%.0f", @Image {"width"} * 100) ; + $height = sprintf ("%.0f", @Image {"height"} * 100) ; + $html = <<__HTML__ ; + +<html> +<head> +<title>%FILENAME% - EasyTimeline test file</title>\n +</head> + +<body> +<h1><font color="green">EasyTimeline</font> - Test Page</h1> + +<b>Fixed size version (PNG): file $file_png</b><p> +<map name="map1"> +$map</map> + +<!-- +If you want a border simplest way is set <img .. border='1'> +Here tables are used to draw similar borders around both images (border='1' seems not to work for embed tag) +--> + +<table border='1' cellpadding='0' cellspacing='0'><tr><td> +<img src=$file_png usemap='#map1' border='0'> +</td></tr></table> + +<hr> +<b>Scalable version (SVG): file $file_vector</b><p> +<table border='1' cellpadding='0' cellspacing='0'><tr><td> +<noembed>Your browser does not support embedded objects</noembed> +<embed src='$file_vector' name='SVGEmbed' border='1' +width='$width' height='$height' type='image/svg-xml' pluginspage='http://www.adobe.com/svg/viewer/install/'> +</td></tr></table> + +<p>As you can see the scalable version renders fonts smoother better than the bitmap version. +<br>Any SVG picture can also be rescaled or zoomed into, without annoying artefacts. 
+ +<p>Windows users:<br> +<small> Right mouse click on picture for zoom options or</small> +<p><small> Ctrl+click for zoom in</small> +<br><small> Ctrl+Shift+click for zoom out</small> +<br><small> Alt+drag with mouse to move focus</small> + +</body> +</html> + +__HTML__ + + $html =~ s/\%FILENAME\%/$file_name/ ; + + open "FILE_OUT", ">", $file_html ; + print FILE_OUT $html ; + close "FILE_OUT" ; + } +# my $cmd = "\"c:\\\\Program Files\\\\XnView\\\\xnview.exe\"" ; +# system ("\"c:\\\\Program Files\\\\XnView\\\\xnview.exe\"", "d:\\\\Wikipedia\\Perl\\\\Wo2\\\\Test.png") ; +} + +sub WriteTexts +{ + my ($line, $xpos, $ypos) ; + foreach $line (@PlotText) + { + my ($at, $bar, $text, $textcolor, $fontsize, $align, $shift, $link, $hint) = split (",", $line) ; + $text =~ s/\#\%\$/\,/g ; + $link =~ s/\#\%\$/\,/g ; + $hint =~ s/\#\%\$/\,/g ; + $shift =~ s/\#\%\$/\,/g ; + $textcolor =~ s/\#\%\$/\,/g ; + + my $barcnt = 0 ; + for ($b = 0 ; $b <= $#Bars ; $b++) + { + if (lc(@Bars [$b]) eq lc($bar)) + { $barcnt = ($b + 1) ; last ; } + } + + if (@Axis {"time"} eq "x") + { $xpos = "$at(s)" ; $ypos = "[$barcnt](s)" ; } + else + { $ypos = "$at(s)" ; $xpos = "[$barcnt](s)" ; } + + if ($shift ne "") + { + my ($shiftx, $shifty) = split (",", $shift) ; + if ($shiftx > 0) + { $xpos .= "+$shiftx" ; } + if ($shiftx < 0) + { $xpos .= "$shiftx" ; } + if ($shifty > 0) + { $ypos .= "+$shifty" ; } + if ($shifty < 0) + { $ypos .= "$shifty" ; } + } + + &WriteText ("~", $bar, $shiftx, $xpos, $ypos, $text, $textcolor, $fontsize, $align, $link, $hint) ; + } +} + +sub PlotBars +{ + #proc getdata / #proc bars + while ($#PlotBarsNow >= 0) + { + undef @PlotBarsLater ; + + $maxwidth = 0 ; + foreach $entry (@PlotBarsNow) + { + ($width) = split (",", $entry) ; + if ($width > $maxwidth) + { $maxwidth = $width ; } + } + + $script .= "#proc getdata\n" ; + $script .= " delim: comma\n" ; + $script .= " data:\n" ; + + foreach $entry (@PlotBarsNow) + { + my ($width, $bar, $from, $till, $color, $link, $hint) = split 
(",", $entry) ; + if ($width < $maxwidth) + { + push @PlotBarsLater, $entry ; + next ; + } + for ($b = 0 ; $b <= $#Bars ; $b++) + { + if (lc(@Bars [$b]) eq lc($bar)) + { $bar = ($#Bars - ($b - 1)) ; last ; } + } + if (@Axis {"order"} !~ /reverse/i) + { $entry = "$bar,$from,$till,$color,$link,$hint,\n" ; } + else + { $entry = "$bar," . (-$till) . "," . (-$from) . ",$color,$link,$hint,\n" ; } + + $script .= "$entry" ; + } + $script .= "\n" ; + + #proc bars + $script .= "#proc bars\n" ; + $script .= " axis: " . @Axis {"time"} . "\n" ; + $script .= " barwidth: $maxwidth\n" ; + $script .= " outline: no\n" ; +# $script .= " thinbarline: width=5\n" ; + if (@Axis {"time"} eq "x") + { $script .= " horizontalbars: yes\n" ; } + $script .= " locfield: 1\n" ; + $script .= " segmentfields: 2 3\n" ; + $script .= " colorfield: 4\n" ; +# $script .= " outline: width=1\n" ; +# $script .= " barwidthfield: 5\n" ; +# if (@fields [4] ne "") +# { $script .= " clickmapurl: " . &LinkToUrl ($text) . "\n" ; } +# if (@fields [5] ne "") +# { $script .= " clickmaplabel: $text\n" ; } + $script .= " clickmapurl: \@\@5\n" ; + $script .= " clickmaplabel: \@\@6\n" ; + $script .= "\n" ; + + @PlotBarsNow = @PlotBarsLater ; + } +} + +sub PlotScale +{ + my $scale = shift ; + my $grid = shift ; + my ($color, $from, $till, $start) ; + + %x = %Period ; +# if (($DateFormat =~ /\//) && ($grid)) +# { return ; } + +# if (($DateFormat =~ /\//) +# { +# } + +# if (! $grid) # redefine area, scale linear for time axis, showl whole years always, Ploticus bug +# { + # $from = @Period {"from"} ; + # $till = @Period {"till"} ; + $from = &DateToFloat (@Period {"from"}) ; + $till = &DateToFloat (@Period {"till"}) ; + # $from =~ s/.*\///g ; # delete dd mm if present + # $till =~ s/.*\///g ; + #proc areadef + $script .= "#proc areadef\n" ; + $script .= " #clone: A\n" ; + $script .= " " . @Axis {"time"} . "scaletype: linear\n" ; # date yyyy + + if (@Axis {"order"} !~ /reverse/i) + { $script .= " " . @Axis {"time"} . 
"range: $from $till\n" ; } + else + { $script .= " " . @Axis {"time"} . "range: " . (-$till) . " " . (-$from) . "\n" ; } + + $script .= "\n" ; +# } + + $script .= "#proc " . @Axis {"time"} . "axis\n" ; + + if (($scale eq "Major") && (! $grid)) + { +# $script .= " stubs: incremental " . @Scales {"Major inc"} . " " . @Scales {"Major unit"} . "\n" ; +# if ($DateFormat =~ /\//) +# { $script .= " stubformat: " . @Axis {"format"} . "\n" ; } +# temp always show whole years (Ploticus autorange bug) + if (@Scales {"Major stubs"} eq "") # ($DateFormat !~ /\//) + { $script .= " stubs: incremental " . @Scales {"Major inc"} . "\n" ; } + else + { $script .= " stubs: list " . @Scales {"Major stubs"} . "\n" ; } + } + else + { $script .= " stubs: none\n" ; } + + if ($DateFormat !~ /\//) +# { $script .= " ticincrement: " . @Scales {"$scale inc"} . " " . @Scales {"$scale unit"} . "\n" ; } + { $script .= " ticincrement: " . @Scales {"$scale inc"} . "\n" ; } + else + { + my $unit = 1 ; + if (@Scales {"$scale unit"} =~ /month/i) + { $unit = 1/12 ; } + if (@Scales {"$scale unit"} =~ /day/i) + { $unit = 1/365 ; } + $script .= " ticincrement: " . @Scales {"$scale inc"} . 
" $unit\n" ; + } + + if (defined (@Scales {"$scale start"})) + { + $start = @Scales {"$scale start"} ; + # $start =~ s/.*\///g ; # delete dd mm if present + $start = &DateToFloat ($start) ; + if (@Axis {"order"} =~ /reverse/i) + { + $loop = 0 ; + $start = -$start ; + while ($start - @Scales {"$scale inc"} >= - @Period {"till"}) + { + $start -= @Scales {"$scale inc"} ; + if (++$loop > 1000) { last ; } # precaution + } + } + $script .= " stubrange: $start\n" ; + } + + if ($scale eq "Major") + { + $script .= " ticlen: 0.05\n" ; + if (@Axis {"time"} eq "y") + { $script .= " stubdetails: adjust=0.05,0\n" ; } + if (@Axis {"order"} =~ /reverse/i) + { $script .= " signreverse: yes\n" ; } + } + else + { $script .= " ticlen: 0.02\n" ; } +# $script .= " location: 4\n" ; test + + $color .= @Scales {"$scale grid"} ; + + if (defined (@Colors {$color})) + { $color = @Colors {$color} ; } + + if ($grid) + { $script .= " grid: color=$color\n" ; } + + $script .= "\n" ; + + if ($grid) # restore areadef + { + #proc areadef + $script .= "#proc areadef\n" ; + $script .= " #clone: A\n" ; + $script .= "\n" ; + } +} + +sub PlotLines +{ + my $layer = shift ; + + if ($#DrawLines < 0) + { return ; } + + undef (@DrawLinesNow) ; + + foreach $line (@DrawLines) + { + if ($line =~ /\|$layer\n/) + { push @DrawLinesNow, $line ; } + } + + if ($#DrawLinesNow < 0) + { return ; } + + foreach $entry (@DrawLinesNow) + { + chomp ($entry) ; + $script .= "#proc line\n" ; +# $script .= " notation: scaled\n" ; + if ($entry =~ /^[12]/) + { ($mode, $at, $from, $till, $color, $width) = split ('\|', $entry) ; } + else + { ($mode, $points, $color, $width) = split ('\|', $entry) ; } + + $script .= " linedetails: width=$width color=$color style=0\n" ; + + if ($mode == 1) # draw perpendicular to time axis + { + if (@Axis {"order"} =~ /reverse/i) + { $at = -$at ; } + + if (@Axis {"time"} eq "x") + { + if ($from eq "") + { $from = @PlotArea {"bottom"} } + if ($till eq "") + { $till = @PlotArea {"bottom"} + @PlotArea 
{"height"} } + $from += ($width/200) ; # compensate for overstrechting of thick lines + $till -= ($width/200) ; + if ($from > @Image {"height"}) + { $from = @Image {"height"} ; } + if ($till > @Image {"height"}) + { $till = @Image {"height"} ; } + $script .= " points: $at(s) $from $at(s) $till\n" ; + } + else + { + if ($from eq "") + { $from = @PlotArea {"left"} } + if ($till eq "") + { $till = @PlotArea {"left"} + @PlotArea {"width"} } + $from += ($width/200) ; + $till -= ($width/200) ; + if ($from > @Image {"width"}) + { $from = @Image {"width"} ; } + if ($till > @Image {"width"}) + { $till = @Image {"width"} ; } + $script .= " points: $from $at(s) $till $at(s)\n" ; + } + } + + if ($mode == 2) # draw parralel to time axis + { + if (@Axis {"order"} =~ /reverse/i) + { + $from = -$from ; + $till = -$till ; + } + + $from .= "(s)+" .($width/200) ; + $till .= "(s)-" .($width/200) ; + if (@Axis {"time"} eq "x") + { + if ($at eq "") + { $at = @PlotArea {"bottom"} ; } + if ($at > @Image {"height"}) + { $at = @Image {"height"} ; } + $script .= " points: $from $at $till $at\n" ; + } + else + { + if ($at eq "") + { $at = @PlotArea {"left"} ; } + if ($at > @Image {"width"}) + { $at = @Image {"width"} ; } + $script .= " points: $at $from $at $till\n" ; + } + } + + if ($mode == 3) # draw free line + { + @Points = split (",", $points) ; + foreach $point (@Points) + { $point = &Normalize ($point) ; } + if ((@Points [0] > @Image {"width"}) || + (@Points [1] > @Image {"height"}) || + (@Points [2] > @Image {"width"}) || + (@Points [3] > @Image {"height"})) + { &Error2 ("Linedata attribute 'points' invalid.\n" . + sprintf ("(%d,%d)(%d,%d)", @Points[0]*100, @Points[1]*100, @Points[2]*100, @Points[3]*100) . 
" does not fit in image\n") ; + return ; } + $script .= " points: @Points[0] @Points[1] @Points[2] @Points[3]\n" ; + } + } + + + $script .= "\n" ; +} + +sub ColorPredefined +{ + my $color = shift ; + if ($color =~ /^(?:black|white|tan1|tan2|red|magenta|claret|coral|pink|orange| + redorange|lightorange|yellow|yellow2|dullyellow|yelloworange| + brightgreen|green|kelleygreen|teal|drabgreen|yellowgreen| + limegreen|brightblue|darkblue|blue|oceanblue|skyblue| + purple|lavender|lightpurple|powderblue|powderblue2)$/xi) + { + if (! defined (@Colors {lc ($color)})) + { &StoreColor ($color, $color, "", $command) ; } + return ($true) ; + } + else + { return ($false) ; } +} + +sub ValidAbs +{ + $value = shift ; + if ($value =~ /^ \d+ \.? \d* (?:px|in|cm)? $/xi) + { return ($true) ; } + else + { return ($false) ; } +} + +sub ValidAbsRel +{ + $value = shift ; + if ($value =~ /^ \d+ \.? \d* (?:px|in|cm|$hPerc)? $/xi) + { return ($true) ; } + else + { return ($false) ; } +} + +sub ValidDateFormat +{ + my $date = shift ; + my ($day, $month, $year) ; + +# if ($date=~ /^\-?\d+$/) # for now full years are always allowed +# { return ($true) ; } + + if ($DateFormat eq "yyyy") + { + if (! ($date=~ /^\-?\d+$/)) + { return ($false) ; } + return ($true) ; + } + + if ($DateFormat eq "x.y") + { + if (! ($date=~ /^\-?\d+(?:\.\d+)?$/)) + { return ($false) ; } + return ($true) ; + } + + if (! 
($date=~ /^\d\d\/\d\d\/\d\d\d\d$/)) + { return ($false) ; } + + if ($DateFormat eq "dd/mm/yyyy") + { + $day = substr ($date,0,2) ; + $month = substr ($date,3,2) ; + $year = substr ($date,6,4) ; + } + else + { + $day = substr ($date,3,2) ; + $month = substr ($date,0,2) ; + $year = substr ($date,6,4) ; + } + + if ($month =~ /^(?:01|03|05|07|08|10|12)$/) + { if ($day > 31) { return ($false) ; }} + elsif ($month =~ /^(?:04|06|09|11)$/) + { if ($day > 30) { return ($false) ; }} + elsif ($month =~ /^02$/) + { + if (($year % 4 == 0) && ($year % 100 != 0)) + { if ($day > 29) { return ($false) ; }} + else + { if ($day > 28) { return ($false) ; }} + } + else { return ($false) ; } + return ($true) ; +} + +sub ValidDateRange +{ + my $date = shift ; + my ($day, $month, $year, + $dayf, $monthf, $yearf, + $dayt, $montht, $yeart) ; + + my $from = @Period {"from"} ; + my $till = @Period {"till"} ; + + if (($DateFormat eq "yyyy") || ($DateFormat eq "x.y")) + { + if (($date < $from) || ($date > $till)) + { return ($false) ; } + return ($true) ; + } + + if ($DateFormat eq "dd/mm/yyyy") + { + $day = substr ($date,0,2) ; + $month = substr ($date,3,2) ; + $year = substr ($date,6,4) ; + $dayf = substr ($from,0,2) ; + $monthf = substr ($from,3,2) ; + $yearf = substr ($from,6,4) ; + $dayt = substr ($till,0,2) ; + $montht = substr ($till,3,2) ; + $yeart = substr ($till,6,4) ; + } + if ($DateFormat eq "mm/dd/yyyy") + { + $day = substr ($date,3,2) ; + $month = substr ($date,0,2) ; + $year = substr ($date,6,4) ; + $dayf = substr ($from,3,2) ; + $monthf = substr ($from,0,2) ; + $yearf = substr ($from,6,4) ; + $dayt = substr ($till,3,2) ; + $montht = substr ($till,0,2) ; + $yeart = substr ($till,6,4) ; + } + + if (($year < $yearf) || + (($year == $yearf) && + (($month < $monthf) || + (($month == $monthf) && ($day < $dayf)) + ))) + { return ($false) } + + if (($year > $yeart) || + (($year == $yeart) && + (($month > $montht) || + (($month == $montht) && ($day > $dayt)) + ))) + { return ($false) } + 
+ return ($true) ; +} + +sub DateMedium +{ + my $from = shift ; + my $till = shift ; + + if (($DateFormat eq "yyyy") || ($DateFormat eq "x.y")) + { return (sprintf ("%.3f", ($from + $till) / 2)) ; } + + $from2 = &DaysFrom1800 ($from) ; + $till2 = &DaysFrom1800 ($till) ; + my $date = &DateFrom1800 (int (($from2 + $till2) / 2)) ; + return ($date) ; +} + +sub DaysFrom1800 +{ + @mmm = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) ; + my $date = shift ; + if ($DateFormat eq "dd/mm/yyyy") + { + $day = substr ($date,0,2) ; + $month = substr ($date,3,2) ; + $year = substr ($date,6,4) ; + } + else + { + $day = substr ($date,3,2) ; + $month = substr ($date,0,2) ; + $year = substr ($date,6,4) ; + } + if ($year < 1800) + { &Error2 ("Function 'DaysFrom1800' expects year >= 1800, not '$year'.") ; return ; } + + $days = ($year - 1800) * 365 ; + $days += int (($year -1 - 1800) / 4) ; + $days -= int (($year -1 - 1800) / 100) ; + if ($month > 1) + { + for ($m = $month - 2 ; $m >= 0 ; $m--) + { + $days += @mmm [$m] ; + if ($m == 1) + { + if ((($year % 4) == 0) && (($year % 100) != 0)) + { $days ++ ; } + } + } + } + $days += $day ; + + return ($days) ; +} + +sub DateToFloat +{ + my $date = shift ; + if ($DateFormat !~ /\//) + { return ($date) ; } + my $year = $date ; + $year =~ s/.*\///g ; # delete dd mm/mm dd + my $fraction = (&DaysFrom1800 ($date) - &DaysFrom1800 ("01/01/" . 
$year)) / 365.25 ; + return ($year + $fraction) ; +} + +sub DateFrom1800 +{ + my $days = shift ; + + @mmm = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) ; + + $year = 1800 ; + while ($days > 365 + (($year % 4) == 0)) + { + if ((($year % 4) == 0) && (($year % 100) != 0)) + { $days -= 366 ; } + else + { $days -= 365 ; } + $year ++ ; + } + + $month = 0 ; + while ($days > @mmm [$month]) + { + $days -= @mmm [$month] ; + if ($month == 1) + { + if ((($year % 4) == 0) && (($year % 100) != 0)) + { $days -- ; } ; + } + $month++ ; + } + $day = $days ; + + $month ++ ; + if ($DateFormat eq "dd/mm/yyyy") + { $date = sprintf ("%02d/%02d/%04d", $day, $month, $year) ; } + else + { $date = sprintf ("%02d/%02d/%04d", $month, $day, $year) ; } + + return ($date) ; +} + +sub ExtractText +{ + my $data = shift ; + my $data2 = $data ; + my $text = "" ; + + # special case: allow embedded spaces when 'text' is last attribute +# $data2 =~ s/\:\:/\@\#\!/g ; + if ($data2 =~ /text\:[^\:]+$/) + { + $text = $data2 ; + $text =~ s/^.*?text\:// ; +# $text =~ s/^\s(.*?)\s*$/$1/ ; ?? -> + $text =~ s/^(.*?)\s*$/$1/ ; + $text =~ s/\\n/\n/g ; + $text =~ s/\"\"/\@\#\$/g ; + $text =~ s/\"//g ; + $text =~ s/\@\#\$/"/g ; + $data2 =~ s/text\:.*$// ; + } + + # extract text between double quotes + $data2 =~ s/\"\"/\@\#\$/g ; + if ($data2 =~ /text\:\s*\"/) + { + $text = $data2 ; + $text =~ s/^.*?text\:\s*\"// ; + + if (! ($text =~ /\"/)) + { &Error ("PlotData invalid. 
Attribute 'text': no closing \" found.") ; + return ("x", "x") ; } + + $text =~ s/\".*$//; + $text =~ s/\@\#\$/"/g ; + $text =~ s/\\n/\n/g ; + } + $data2 =~ s/text\:\s*\"[^\"]*\"// ; + $data2 =~ s/\@\#\$/"/g ; + return ($data2, $text) ; +} + +sub ParseText +{ + my $text = shift ; + $text =~ s/\_\_/\@\#\$/g ; + $text =~ s/\_/ /g ; + $text =~ s/\@\#\$/_/g ; + + $text =~ s/\~\~/\@\#\$/g ; + $text =~ s/\~/\\n/g ; + $text =~ s/\@\#\$/~/g ; + + return ($text) ; +} + +sub BarDefined +{ + my $bar = shift ; + foreach $bar2 (@Bars) + { + if (lc ($bar2) eq lc ($bar)) + { return ($true) ; } + } + +# not part of barset ? return + if ($bar != /\#\d+$/) + { return ($false) ; } + +# find previous bar in barset + my $barcnt = $bar ; + my $barid = $bar ; + $barcnt =~ s/.*\#(\d+$)/$1/ ; + $barid =~ s/(.*\#)\d+$/$1/ ; + $barcnt -- ; + $a = $#Bars ; + for (my $b = 0 ; $b <= $#Bars ; $b++) + { + if (lc (@Bars [$b]) eq lc ($barid . $barcnt)) + { + $b++ ; + for (my $b2 = $#Bars + 1 ; $b2 > $b ; $b2--) + { @Bars [$b2] = @Bars [$b2-1]; } + @Bars [$b] = lc ($bar) ; + @BarLegend {lc ($bar)} = " " ; + return ($true) ; + } + } + return ($false) ; +} + +sub ValidAttributes +{ + my $command = shift ; + + if ($command =~ /^BackgroundColors$/i) + { return (CheckAttributes ($command, "", "canvas,bars")) ; } + + if ($command =~ /^BarData$/i) +# { return (CheckAttributes ($command, "", "bar,barset,barcount,link,text")) ; } + { return (CheckAttributes ($command, "", "bar,barset,link,text")) ; } + + if ($command =~ /^Colors$/i) + { return (CheckAttributes ($command, "id,value", "legend")) ; } + + if ($command =~ /^ImageSize$/i) + { return (CheckAttributes ($command, "", "width,height,barincrement")) ; } + + if ($command =~ /^Legend$/i) + { return (CheckAttributes ($command, "", "columns,columnwidth,orientation,position,left,top")) ; } + + if ($command =~ /^LineData$/i) + { return (CheckAttributes ($command, "", "at,from,till,atpos,frompos,tillpos,points,color,layer,width")) ; } + + if ($command =~ 
/^Period$/i) + { return (CheckAttributes ($command, "from,till", "")) ; } + + if ($command =~ /^PlotArea$/i) + { return (CheckAttributes ($command, "", "left,bottom,width,height,right,top")) ; } + + if ($command =~ /^PlotData$/i) + { return (CheckAttributes ($command, "", "align,anchor,at,bar,barset,color,fontsize,from,link,mark,shift,text,textcolor,till,width")) ; } + + if ($command =~ /^Scale/i) + { return (CheckAttributes ($command, "increment,start", "unit,grid,gridcolor,text")) ; } + + if ($command =~ /^TextData$/i) + { return (CheckAttributes ($command, "", "fontsize,lineheight,link,pos,tabs,text,textcolor")) ; } + + if ($command =~ /^TimeAxis$/i) + { return (CheckAttributes ($command, "", "orientation,format,order")) ; } + + return ($true) ; +} + +sub CheckAttributes +{ + my $name = shift ; + my @Required = split (",", shift) ; + my @Allowed = split (",", shift) ; + + my $attribute ; + my %Attributes2 = %Attributes ; + + $hint = "\nSyntax: '$name =" ; + foreach $attribute (@Required) + { $hint .= " $attribute:.." ; } + foreach $attribute (@Allowed) + { $hint .= " [$attribute:..]" ; } + $hint .= "'" ; + + foreach $attribute (@Required) + { + if ((! defined (@Attributes {$attribute})) || (@Attributes {$attribute} eq "")) + { &Error ("$name definition incomplete. $hint") ; + undef (@Attributes) ; return ($false) ; } + delete (@Attributes2 {$attribute}) ; + } + foreach $attribute (@Allowed) + { delete (@Attributes2 {$attribute}) ; } + + @AttrKeys = keys %Attributes2 ; + if ($#AttrKeys >= 0) + { + if (@AttrKeys [0] eq "single") + { &Error ("$name definition invalid. Specify all attributes as name:value pairs.") ; } + else + { &Error ("$name definition invalid. Invalid attribute '" . @AttrKeys [0] . "' found. 
$hint") ; } + undef (@Attributes) ; return ($false) ; } + + return ($true) ; +} + +sub CheckPreset +{ + my $command = shift ; + my ($preset, $action, $attrname, $attrvalue) ; + + my $newcommand = $true ; + my $addvalue = $true ; + if ($command =~ /^$prevcommand$/i) + { $newcommand = $false ; } + if ((! $newcommand) && ($command =~ /^(?:DrawLines|PlotData|TextData)$/i)) + { $addvalue = $false ; } + $prevcommand = $command ; + + foreach $preset (@PresetList) + { + if ($preset =~ /^$command\|/i) + { + ($command, $action, $attrname, $attrpreset) = split ('\|', $preset) ; + if ($attrname eq "") + { $attrname = "single" ; } + + $attrvalue = @Attributes {$attrname} ; + + if (($action eq "-") && ($attrvalue ne "")) + { + if ($attrname eq "single") + { &Error ("Chosen preset makes this command redundant.\n" . + " Please remove this command.") ; } + else + { &Error ("Chosen preset conflicts with '$attrname:...'.\n" . + " Please remove this attribute.") ; } + @Attributes {$attrname} = "" ; + } + + if (($action eq "+") && ($attrvalue eq "")) + { + if ($addvalue) + { @Attributes {$attrname} = $attrpreset ; } + } + + if (($action eq "=") && ($attrvalue eq "")) + { @Attributes {$attrname} = $attrpreset ; } + + if (($action eq "=") && ($attrvalue ne "") && + ($attrvalue !~ /$attrpreset/i)) + { + if ($attrname eq "single") + { &Error ("Conflicting settings.\nPreset defines '$attrpreset'.") ; } + else + { &Error ("Conflicting settings.\nPreset defines '$attrname:$attrpreset'.") ; } + @Attributes {$attrname} = $attrpreset ; + } + } + } +} + +sub ShiftOnePixelForSVG +{ + my $line = shift ; + $line =~ s/location:\s*// ; + my ($posx, $posy) = split (" ", $line) ; + + if ($posy =~ /\+/) + { ($posy1, $posy2) = split ('\+', $posy) ; } + elsif ($posy =~ /.+\-/) + { + if ($posy =~ /^\-/) + { + ($sign, $posy1, $posy2) = split ('\-', $posy) ; $posy2 = - $posy2 ; + $posy1 = "-" . 
$posy1 ;
  }
  else
  { ($posy1, $posy2) = split ('\-', $posy) ; $posy2 = - $posy2 }
  }
  else
  { $posy1 = $posy ; $posy2 = 0 ; }

  if ($posy1 !~ /(s)/)
  { $posy += 0.01 ; }
  else
  {
    $posy2 += 0.01 ;
    if ($posy2 == 0)
    { $posy = $posy1 ; }
    elsif ($posy2 < 0)
    { $posy = $posy1 . "$posy2" ; }
    else
    { $posy = $posy1 . "+" . $posy2 ; }
  }

  $line = "\n location: $posx $posy" ;
  return ($line) ;
}

# Ensure a URL carries a full "http://" / "https://" scheme prefix
# (repairing "http/", "http//" etc.) and percent-encode embedded spaces.
# Other characters are left untouched.
sub NormalizeURL
{
  my $url = shift ;
  $url =~ s/(https?)\:?\/?\/?/$1:\/\// ; # add possibly missing special characters
  $url =~ s/ /%20/g ;
  return ($url) ;
}

# wiki style link may include linebreak characters -> split into several wiki links
# Each line of the visible text becomes its own [..|..] / [[..|..]] link
# that shares the same (newline-stripped) link target.
sub NormalizeWikiLink
{
  my $text = shift ;

  my $brdouble = $false ;
  if ($text =~ /\[\[.*\]\]/)
  { $brdouble = $true ; }

  $text =~ s/\[\[?// ;
  $text =~ s/\]?\]// ;

  my ($hide,$show) = split ('\|', $text) ;
  if ($show eq "")
  { $show = $hide ; }
  $hide =~ s/\s*\n\s*/ /g ;

  my @Show = split ("\n", $show) ;
  $text = "" ;
  foreach $part (@Show)
  {
    if ($brdouble)
    { $part = "[[" . $hide . "|" . $part . "]]" ; }
    else
    { $part = "[" . $hide . "|" . $part . "]" ; }
  }
  $text = join ("\n", @Show) ;

  return ($text) ;
}

# Split a text attribute value into (text, link, hint).
# When an explicit link is supplied, wiki brackets in the text are
# stripped; otherwise the first [[..]] / [..] link found in the text
# becomes the link. [[xx:Title]] interwiki links are resolved to
# Wikipedia URLs; plain [[Title]] links go through $articlepath.
# NOTE(review): assumes $articlepath contains a "$1" placeholder for
# the encoded title — confirm against the option parsing code.
sub ProcessWikiLink
{
  my $text = shift ;
  my $link = shift ;
  my $hint = shift ;
  my $wikilink = $false ;

  chomp ($text) ;
  chomp ($link) ;
  chomp ($hint) ;

  my ($wiki, $title) ;
  if ($link ne "") # ignore wiki brackets in text when explicit link is specified
  {
    $text =~ s/\[\[ [^\|]+ \| (.*) \]\]/$1/gx ;
    $text =~ s/\[\[ [^\:]+ \: (.*) \]\]/$1/gx ;
#   $text =~ s/\[\[ (.*) \]\]/$1/gx ;
  }
  else
  {
    if ($text =~ /\[.+\]/) # keep first link in text segment, remove others
    {
      $link = $text ;
      $link =~ s/\n//g ;
      $link =~ s/^[^\[\]]*\[/[/x ;

      if ($link =~ /^\[\[/)
      { $wikilink = $true ; }

      $link =~ s/^ [^\[]* \[+ ([^\[\]]*) \].*$/$1/x ;
      $link =~ s/\|.*$// ;
      if ($wikilink)
      { $link = "[[" . $link . "]]" ; }

      $text =~ s/(\[+) [^\|\]]+ \| ([^\]]*) (\]+)/$1$2$3/gx ;
      $text =~ s/(https?)\:/$1colon/gx ;
#     $text =~ s/(\[+) [^\:\]]+ \: ([^\]]*) (\]+)/$1$2$3/gx ; #???

      # remove interwiki link prefix
      $text =~ s/(\[+) (?:.{2,3}|(?:zh\-.*)|simple|minnan|tokipona) \: ([^\]]*) (\]+)/$1$2$3/gxi ; #???

      $text =~ s/\[+ ([^\]]+) \]+/{{{$1}}}/x ;
      $text =~ s/\[+ ([^\]]+) \]+/$1/gx ;
      $text =~ s/\{\{\{ ([^\}]*) \}\}\}/[[$1]]/x ;
    }
#   if ($text =~ /\[\[.+\]\]/)
#   {
#     $wikilink = $true ;
#     $link = $text ;
#     $link =~ s/\n//g ;
#     $link =~ s/^.*?\[\[/[[/x ;
#     $link =~ s/\| .*? \]\].*$/]]/x ;
#     $link =~ s/\]\].*$/]]/x ;
#     $text =~ s/\[\[ [^\|\]]+ \| (.*?) \]\]/[[$1]]/x ;
#     $text =~ s/\[\[ [^\:\]]+ \: (.*?) \]\]/[[$1]]/x ;

#     # remove remaining links
#     $text =~ s/\[\[ ([^\]]+) \]\]/^%#$1#%^/x ;
#     $text =~ s/\[+ ([^\]]+) \]+/$1/gx ;
#     $text =~ s/\^$hPerc\# (.*?) \#$hPerc\^/[[$1]]/x ;
#   }
#   elsif ($text =~ /\[.+\]/)
#   {
#     $link = $text ;
#     $link =~ s/\n//g ;
#     $link =~ s/^.*?\[/[/x ;
#     $link =~ s/\| .*? \].*$/]/x ;
#     $link =~ s/\].*$/]/x ;
#     $link =~ s/\[ ([^\]]+) \]/$1/x ;
#     $text =~ s/\[ [^\|\]]+ \| (.*?) \]/[[$1]]/x ;

#     # remove remaining links
#     $text =~ s/\[\[ ([^\]]+) \]\]/^%#$1#%^/x ;
#     $text =~ s/\[+ ([^\]]+) \]+/$1/gx ;
#     $text =~ s/\^$hPerc\# (.*?) \#$hPerc\^/[[$1]]/x ;
##    $text =~ s/\[\[ (.*) \]\]/$1/gx ;
#   }

  }

  if ($wikilink)
  {
#   if ($link =~ /^\[\[.+\:.+\]\]$/) # has a colon in its name
    if ($link =~ /^\[\[ (?:.{2,3}|(?:zh\-.*)|simple|minnan|tokipona) \: .+\]\]$/xi) # has a interwiki link prefix
    {
      # This will fail for all interwiki links other than Wikipedia.
      $wiki  = lc ($link) ;
      $title = $link ;
      $wiki  =~ s/\[\[([^\:]+)\:.*$/$1/x ;
      $title =~ s/^[^\:]+\:(.*)\]\]$/$1/x ;
      $title =~ s/ /_/g ;
      # Build the full interwiki URL from the percent-encoded title.
      # (Bug fix: the code previously assembled the URL and then
      # overwrote $link with just &EncodeURL($title), losing the
      # "http://xx.wikipedia.org/wiki/" host part entirely.)
      $link = "http://$wiki.wikipedia.org/wiki/" . &EncodeURL ($title) ;
      if (($hint eq "") && ($title ne ""))
      { $hint = "$wiki: $title" ; }
    }
    else
    {
      # $wiki = "en" ;
      $title = $link ;
      $title =~ s/^\[\[(.*)\]\]$/$1/x ;
      $title =~ s/ /_/g ;
      $link = $articlepath ;
      $urlpart = &EncodeURL ($title) ;
      $link =~ s/\$1/$urlpart/ ;
      if (($hint eq "") && ($title ne ""))
      { $hint = "$title" ; }
    }
    $hint =~ s/_/ /g ;
  }
  else
  {
    if ($link ne "")
    { $hint = &ExternalLinkToHint ($link) ; }
  }

  if (($link ne "") && ($text !~ /\[\[/) && ($text !~ /\]\]/))
  { $text = "[[" . $text . "]]" ; }

  $hint = &EncodeHtml ($hint) ;
  return ($text, $link, $hint) ;
}

# Derive a hover hint from an external URL: strip the scheme and the
# path, keep "host/..".
sub ExternalLinkToHint
{
  my $hint = shift ;
  $hint =~ s/^https?\:?\/?\/?// ;
  $hint =~ s/\/.*$// ;
  return (&EncodeHtml ($hint . "/..")) ;
}

# Armor shell/Ploticus-sensitive characters as %XX% tokens; undone by
# &DecodeInput before the script file is written.
sub EncodeInput
{
  my $text = shift ;
  # revert encoding of '<' & '>' by MediaWiki
  $text =~ s/\<\;/\</g ;
  $text =~ s/\>\;/\>/g ;
  $text =~ s/([\`\{\}\%\&\@\$\(\)\;\=])/"%" . sprintf ("%X", ord($1)) . "%";/ge ;
  return ($text) ;
}

# Reverse of &EncodeInput: turn %XX% tokens back into raw characters.
sub DecodeInput
{
  my $text = shift ;
  $text =~ s/\%([0-9A-F]{2})\%/chr(hex($1))/ge ;
  return ($text) ;
}

# HTML-escape <, >, &, ', " as numeric entities and newlines as <br>.
sub EncodeHtml
{
  my $text = shift ;
  $text =~ s/([\<\>\&\'\"])/"\&\#" . ord($1) . "\;"/ge ;
  $text =~ s/\n/<br>/g ;
  return ($text) ;
}

sub EncodeURL
{
  my $url = shift ;
  # For some reason everything gets run through this weird internal
  # encoding that's similar to URL-encoding. Armor against this as well,
  # or else adjacent encoded bytes will be corrupted.
  $url =~ s/([^0-9a-zA-Z\%\:\/\._])/"%25%".sprintf ("%02X",ord($1))/ge ;
  return ($url) ;
}

# Record a parse error (with input-line context unless -l given);
# aborts the run after 10 accumulated errors.
sub Error
{
  my $msg = &DecodeInput(shift) ;
  $msg =~ s/\n\s*/\n /g ; # indent consecutive lines

  $CntErrors++ ;
  if (!
$listinput) + { push @Errors, "Line $LineNo: " . &DecodeInput($Line) . "\n" ; } + push @Errors, "- $msg\n\n" ; + if ($CntErrors > 10) + { &Abort ("More than 10 errors found") ; } +} + +sub Error2 +{ + my $msg = &DecodeInput(shift) ; + $msg =~ s/\n\s*/\n /g ; # indent consecutive lines + $CntErrors++ ; + push @Errors, "- $msg\n" ; +} + +sub Warning +{ + my $msg = &DecodeInput(shift) ; + $msg =~ s/\n\s*/\n /g ; # indent consecutive lines + if (! $listinput) + { push @Warnings, "Line $LineNo: " . &DecodeInput ($Line) . "\n" ; } + push @Warnings, "- $msg\n\n" ; +} + +sub Warning2 +{ + my $msg = &DecodeInput(shift) ; + $msg =~ s/\n\s*/\n /g ; # indent consecutive lines + push @Warnings, "- $msg\n" ; +} + +sub Info +{ + my $msg = &DecodeInput(shift) ; + $msg =~ s/\n\s*/\n /g ; # indent consecutive lines + if (! $listinput) + { push @Info, "Line $LineNo: " . &DecodeInput ($Line) . "\n" ; } + push @Info, "- $msg\n\n" ; +} + +sub Info2 +{ + my $msg = &DecodeInput(shift) ; + $msg =~ s/\n\s*/\n /g ; # indent consecutive lines + push @Info, "- $msg\n" ; +} + +sub Abort +{ + my $msg = &DecodeInput(shift) ; + + print "\n\n***** " . $msg . " *****\n\n" ; + print @Errors ; + print "Execution aborted.\n" ; + + open "FILE_OUT", ">", $file_errors ; + print FILE_OUT "<p>EasyTimeline $version</p><p><b>Timeline generation failed: " . &EncodeHtml ($msg) ."</b></p>\n" ; + foreach $line (@Errors) + { print FILE_OUT &EncodeHtml ($line) . "\n" ; } + close "FILE_OUT" ; + + if ($makehtml) # generate html test file, which would normally contain png + svg (+ image map) + { + open "FILE_IN", "<", $file_errors ; + open "FILE_OUT", ">", $file_html ; + print FILE_OUT "<html><head>\n<title>Graphical Timelines - HTML test file</title>\n</head>\n" . + "<body><h1><font color='green'>EasyTimeline</font> - Test Page</h1>\n\n" . 
+ "<code>\n" ; + print FILE_OUT <FILE_IN> ; + print FILE_OUT "</code>\n\n</body>\n</html>" ; + close "FILE_IN" ; + close "FILE_OUT" ; + } + exit ; +} + +sub EscapeShellArg +{ + my $arg = shift; + if ($env eq "Linux") { + $arg =~ s/'/\\'/; + $arg = "'$arg'"; + } else { + $arg =~ s/"/\\"/; + $arg = "\"$arg\""; + } + return $arg; +} + +# vim: set sts=2 ts=2 sw=2 et : + +sub UnicodeToAscii { + my $unicode = shift ; + my $char = substr ($unicode,0,1) ; + my $ord = ord ($char) ; + + if ($ord < 128) # plain ascii character + { return ($unicode) ; } # (will not occur in this script) + else + { + # for completeness sake complete routine, only 2 byte unicodes sent here + if ($ord >= 252) + { $value = $ord - 252 ; } + elsif ($ord >= 248) + { $value = $ord - 248 ; } + elsif ($ord >= 240) + { $value = $ord - 240 ; } + elsif ($ord >= 224) + { $value = $ord - 224 ; } + else + { $value = $ord - 192 ; } + for ($c = 1 ; $c < length ($unicode) ; $c++) + { $value = $value * 64 + ord (substr ($unicode, $c,1)) - 128 ; } + +# $html = "\&\#" . $value . ";" ; any unicode can be specified as html char + + if (($value >= 128) && ($value <= 255)) + { return (chr ($value)) ; } + else + { return "?" ; } + } +} + diff --git a/mwlib/Makefile b/mwlib/Makefile new file mode 100644 index 0000000..6f244ef --- /dev/null +++ b/mwlib/Makefile @@ -0,0 +1,20 @@ +RE2C = re2c -w --no-generation-date + +all: _expander.cc _mwscan.cc _mwscan.so _expander.so + +_expander.so: _expander.cc + (cd .. && python ./setup.py build_ext --inplace build) + +_mwscan.so: _mwscan.cc + (cd .. 
&& python ./setup.py build_ext --inplace build) + +_expander.cc: _expander.re + $(RE2C) -o _expander.cc _expander.re + +_mwscan.cc: _mwscan.re + $(RE2C) -o _mwscan.cc _mwscan.re + +clean:: + rm -rf *.pyc *~ *.so build a.out + + diff --git a/mwlib/__init__.py b/mwlib/__init__.py new file mode 100755 index 0000000..8088807 --- /dev/null +++ b/mwlib/__init__.py @@ -0,0 +1,6 @@ + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +#import pkg_resources +#pkg_resources.declare_namespace("mwlib") diff --git a/mwlib/_expander.cc b/mwlib/_expander.cc new file mode 100644 index 0000000..9641ae1 --- /dev/null +++ b/mwlib/_expander.cc @@ -0,0 +1,826 @@ +/* Generated by re2c 0.13.4 */ +#line 1 "_expander.re" +// -*- mode: c++ -*- +// Copyright (c) 2007-2008 PediaPress GmbH +// See README.txt for additional licensing information. + +#include <Python.h> + +#include <iostream> +#include <assert.h> +#include <vector> + +using namespace std; + +#define RET(x) {found(x); return x;} + +struct Token +{ + int type; + int start; + int len; +}; + + +class MacroScanner +{ +public: + + MacroScanner(Py_UNICODE *_start, Py_UNICODE *_end) { + source = start = _start; + end = _end; + cursor = start; + } + + int found(int val) { + if (val==5 && tokens.size()) { + Token &previous_token (tokens[tokens.size()-1]); + if (previous_token.type==val) { + previous_token.len += cursor-start; + return tokens.size()-1; + } + } + Token t; + t.type = val; + t.start = (start-source); + t.len = cursor-start; + tokens.push_back(t); + return tokens.size()-1; + } + + inline int scan(); + + Py_UNICODE *source; + + Py_UNICODE *start; + Py_UNICODE *cursor; + Py_UNICODE *end; + vector<Token> tokens; +}; + + +int MacroScanner::scan() +{ + +std: + + start=cursor; + + Py_UNICODE *marker=cursor; + + Py_UNICODE *save_cursor = cursor; + + +#define YYCTYPE Py_UNICODE +#define YYCURSOR cursor +#define YYMARKER marker +#define YYLIMIT (end) +// #define YYFILL(n) return 0; + 
+#line 80 "_expander.re" + + + + + +#line 87 "_expander.cc" +{ + YYCTYPE yych; + + yych = *YYCURSOR; + if (yych <= '\\') { + if (yych <= '<') { + if (yych <= 0x0000) goto yy10; + if (yych <= ';') goto yy12; + goto yy9; + } else { + if (yych == '[') goto yy5; + goto yy12; + } + } else { + if (yych <= '{') { + if (yych <= ']') goto yy6; + if (yych <= 'z') goto yy12; + } else { + if (yych <= '|') goto yy7; + if (yych <= '}') goto yy4; + goto yy12; + } + } + ++YYCURSOR; + if ((yych = *YYCURSOR) == '{') goto yy78; +yy3: +#line 99 "_expander.re" + {RET(5);} +#line 116 "_expander.cc" +yy4: + yych = *++YYCURSOR; + if (yych == '}') goto yy75; + goto yy3; +yy5: + yych = *++YYCURSOR; + if (yych == '[') goto yy73; + goto yy3; +yy6: + yych = *++YYCURSOR; + if (yych == ']') goto yy73; + goto yy3; +yy7: + ++YYCURSOR; +#line 88 "_expander.re" + {RET(6);} +#line 133 "_expander.cc" +yy9: + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 'M') { + if (yych <= 'G') { + if (yych == '!') goto yy13; + if (yych <= 'F') goto yy3; + goto yy15; + } else { + if (yych == 'I') goto yy17; + if (yych <= 'L') goto yy3; + goto yy16; + } + } else { + if (yych <= 'h') { + if (yych <= 'N') goto yy18; + if (yych == 'g') goto yy15; + goto yy3; + } else { + if (yych <= 'l') { + if (yych <= 'i') goto yy17; + goto yy3; + } else { + if (yych <= 'm') goto yy16; + if (yych <= 'n') goto yy18; + goto yy3; + } + } + } +yy10: + ++YYCURSOR; +#line 98 "_expander.re" + {RET(0);} +#line 166 "_expander.cc" +yy12: + yych = *++YYCURSOR; + goto yy3; +yy13: + yych = *++YYCURSOR; + if (yych == '-') goto yy60; +yy14: + YYCURSOR = YYMARKER; + goto yy3; +yy15: + yych = *++YYCURSOR; + if (yych == 'A') goto yy51; + if (yych == 'a') goto yy51; + goto yy14; +yy16: + yych = *++YYCURSOR; + if (yych == 'A') goto yy45; + if (yych == 'a') goto yy45; + goto yy14; +yy17: + yych = *++YYCURSOR; + if (yych == 'M') goto yy35; + if (yych == 'm') goto yy35; + goto yy14; +yy18: + yych = *++YYCURSOR; + if (yych == 'O') goto yy19; + if (yych != 
'o') goto yy14; +yy19: + yych = *++YYCURSOR; + if (yych <= 'W') { + if (yych == 'I') goto yy21; + if (yych <= 'V') goto yy14; + } else { + if (yych <= 'i') { + if (yych <= 'h') goto yy14; + goto yy21; + } else { + if (yych != 'w') goto yy14; + } + } + yych = *++YYCURSOR; + if (yych == 'I') goto yy30; + if (yych == 'i') goto yy30; + goto yy14; +yy21: + yych = *++YYCURSOR; + if (yych == 'N') goto yy22; + if (yych != 'n') goto yy14; +yy22: + yych = *++YYCURSOR; + if (yych == 'C') goto yy23; + if (yych != 'c') goto yy14; +yy23: + yych = *++YYCURSOR; + if (yych == 'L') goto yy24; + if (yych != 'l') goto yy14; +yy24: + yych = *++YYCURSOR; + if (yych == 'U') goto yy25; + if (yych != 'u') goto yy14; +yy25: + yych = *++YYCURSOR; + if (yych == 'D') goto yy26; + if (yych != 'd') goto yy14; +yy26: + yych = *++YYCURSOR; + if (yych == 'E') goto yy27; + if (yych != 'e') goto yy14; +yy27: + yych = *++YYCURSOR; + if (yych != '>') goto yy14; + ++YYCURSOR; +#line 90 "_expander.re" + {goto noinclude;} +#line 242 "_expander.cc" +yy30: + yych = *++YYCURSOR; + if (yych == 'K') goto yy31; + if (yych != 'k') goto yy14; +yy31: + yych = *++YYCURSOR; + if (yych == 'I') goto yy32; + if (yych != 'i') goto yy14; +yy32: + yych = *++YYCURSOR; + if (yych != '>') goto yy14; + ++YYCURSOR; +#line 91 "_expander.re" + {goto nowiki;} +#line 257 "_expander.cc" +yy35: + yych = *++YYCURSOR; + if (yych == 'A') goto yy36; + if (yych != 'a') goto yy14; +yy36: + yych = *++YYCURSOR; + if (yych == 'G') goto yy37; + if (yych != 'g') goto yy14; +yy37: + yych = *++YYCURSOR; + if (yych == 'E') goto yy38; + if (yych != 'e') goto yy14; +yy38: + yych = *++YYCURSOR; + if (yych == 'M') goto yy39; + if (yych != 'm') goto yy14; +yy39: + yych = *++YYCURSOR; + if (yych == 'A') goto yy40; + if (yych != 'a') goto yy14; +yy40: + yych = *++YYCURSOR; + if (yych == 'P') goto yy41; + if (yych != 'p') goto yy14; +yy41: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '<') { + if (yych <= 0x0000) goto yy14; + if (yych <= ';') goto 
yy41; + goto yy14; + } else { + if (yych != '>') goto yy41; + } + ++YYCURSOR; +#line 92 "_expander.re" + {goto imagemap;} +#line 295 "_expander.cc" +yy45: + yych = *++YYCURSOR; + if (yych == 'T') goto yy46; + if (yych != 't') goto yy14; +yy46: + yych = *++YYCURSOR; + if (yych == 'H') goto yy47; + if (yych != 'h') goto yy14; +yy47: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '<') { + if (yych <= 0x0000) goto yy14; + if (yych <= ';') goto yy47; + goto yy14; + } else { + if (yych != '>') goto yy47; + } + ++YYCURSOR; +#line 93 "_expander.re" + {goto math;} +#line 317 "_expander.cc" +yy51: + yych = *++YYCURSOR; + if (yych == 'L') goto yy52; + if (yych != 'l') goto yy14; +yy52: + yych = *++YYCURSOR; + if (yych == 'L') goto yy53; + if (yych != 'l') goto yy14; +yy53: + yych = *++YYCURSOR; + if (yych == 'E') goto yy54; + if (yych != 'e') goto yy14; +yy54: + yych = *++YYCURSOR; + if (yych == 'R') goto yy55; + if (yych != 'r') goto yy14; +yy55: + yych = *++YYCURSOR; + if (yych == 'Y') goto yy56; + if (yych != 'y') goto yy14; +yy56: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '<') { + if (yych <= 0x0000) goto yy14; + if (yych <= ';') goto yy56; + goto yy14; + } else { + if (yych != '>') goto yy56; + } + ++YYCURSOR; +#line 94 "_expander.re" + {goto gallery;} +#line 351 "_expander.cc" +yy60: + yych = *++YYCURSOR; + if (yych != '-') goto yy14; + yych = *++YYCURSOR; + if (yych != '[') goto yy14; + yych = *++YYCURSOR; + if (yych != '^') goto yy14; + yych = *++YYCURSOR; + if (yych >= 0x0001) goto yy14; + yych = *++YYCURSOR; + if (yych != '<') goto yy14; + yych = *++YYCURSOR; + if (yych != '>') goto yy14; + yych = *++YYCURSOR; + if (yych != ']') goto yy14; + yych = *++YYCURSOR; + if (yych != '*') goto yy14; + yych = *++YYCURSOR; + if (yych != '-') goto yy14; + yych = *++YYCURSOR; + if (yych != '-') goto yy14; + yych = *++YYCURSOR; + if (yych != '>') goto yy14; + ++YYCURSOR; +#line 96 "_expander.re" + {RET(5);} +#line 378 "_expander.cc" +yy73: + ++YYCURSOR; +#line 87 
"_expander.re" + {RET(3);} +#line 383 "_expander.cc" +yy75: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '}') goto yy75; +#line 86 "_expander.re" + {RET(2);} +#line 390 "_expander.cc" +yy78: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '{') goto yy78; +#line 85 "_expander.re" + {RET(1);} +#line 397 "_expander.cc" +} +#line 101 "_expander.re" + + + + +noinclude: + +#line 406 "_expander.cc" +{ + YYCTYPE yych; + yych = *YYCURSOR; + if (yych <= 0x0000) goto yy86; + if (yych != '<') goto yy85; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '/') goto yy88; +yy84: +#line 108 "_expander.re" + {goto noinclude;} +#line 417 "_expander.cc" +yy85: + yych = *++YYCURSOR; + goto yy84; +yy86: + ++YYCURSOR; +#line 109 "_expander.re" + {cursor=start+11; RET(5);} +#line 425 "_expander.cc" +yy88: + yych = *++YYCURSOR; + if (yych == 'N') goto yy90; + if (yych == 'n') goto yy90; +yy89: + YYCURSOR = YYMARKER; + goto yy84; +yy90: + yych = *++YYCURSOR; + if (yych == 'O') goto yy91; + if (yych != 'o') goto yy89; +yy91: + yych = *++YYCURSOR; + if (yych == 'I') goto yy92; + if (yych != 'i') goto yy89; +yy92: + yych = *++YYCURSOR; + if (yych == 'N') goto yy93; + if (yych != 'n') goto yy89; +yy93: + yych = *++YYCURSOR; + if (yych == 'C') goto yy94; + if (yych != 'c') goto yy89; +yy94: + yych = *++YYCURSOR; + if (yych == 'L') goto yy95; + if (yych != 'l') goto yy89; +yy95: + yych = *++YYCURSOR; + if (yych == 'U') goto yy96; + if (yych != 'u') goto yy89; +yy96: + yych = *++YYCURSOR; + if (yych == 'D') goto yy97; + if (yych != 'd') goto yy89; +yy97: + yych = *++YYCURSOR; + if (yych == 'E') goto yy98; + if (yych != 'e') goto yy89; +yy98: + yych = *++YYCURSOR; + if (yych != '>') goto yy89; + ++YYCURSOR; +#line 107 "_expander.re" + {goto std;} +#line 471 "_expander.cc" +} +#line 110 "_expander.re" + + +nowiki: + +#line 478 "_expander.cc" +{ + YYCTYPE yych; + yych = *YYCURSOR; + if (yych <= 0x0000) goto yy106; + if (yych != '<') goto yy105; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '/') 
goto yy108; +yy104: +#line 115 "_expander.re" + {goto nowiki;} +#line 489 "_expander.cc" +yy105: + yych = *++YYCURSOR; + goto yy104; +yy106: + ++YYCURSOR; +#line 116 "_expander.re" + {RET(0);} +#line 497 "_expander.cc" +yy108: + yych = *++YYCURSOR; + if (yych == 'N') goto yy110; + if (yych == 'n') goto yy110; +yy109: + YYCURSOR = YYMARKER; + goto yy104; +yy110: + yych = *++YYCURSOR; + if (yych == 'O') goto yy111; + if (yych != 'o') goto yy109; +yy111: + yych = *++YYCURSOR; + if (yych == 'W') goto yy112; + if (yych != 'w') goto yy109; +yy112: + yych = *++YYCURSOR; + if (yych == 'I') goto yy113; + if (yych != 'i') goto yy109; +yy113: + yych = *++YYCURSOR; + if (yych == 'K') goto yy114; + if (yych != 'k') goto yy109; +yy114: + yych = *++YYCURSOR; + if (yych == 'I') goto yy115; + if (yych != 'i') goto yy109; +yy115: + yych = *++YYCURSOR; + if (yych != '>') goto yy109; + ++YYCURSOR; +#line 114 "_expander.re" + {RET(5);} +#line 531 "_expander.cc" +} +#line 117 "_expander.re" + + +math: + +#line 538 "_expander.cc" +{ + YYCTYPE yych; + yych = *YYCURSOR; + if (yych <= 0x0000) goto yy123; + if (yych != '<') goto yy122; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '/') goto yy125; +yy121: +#line 122 "_expander.re" + {goto math;} +#line 549 "_expander.cc" +yy122: + yych = *++YYCURSOR; + goto yy121; +yy123: + ++YYCURSOR; +#line 123 "_expander.re" + {RET(0);} +#line 557 "_expander.cc" +yy125: + yych = *++YYCURSOR; + if (yych == 'M') goto yy127; + if (yych == 'm') goto yy127; +yy126: + YYCURSOR = YYMARKER; + goto yy121; +yy127: + yych = *++YYCURSOR; + if (yych == 'A') goto yy128; + if (yych != 'a') goto yy126; +yy128: + yych = *++YYCURSOR; + if (yych == 'T') goto yy129; + if (yych != 't') goto yy126; +yy129: + yych = *++YYCURSOR; + if (yych == 'H') goto yy130; + if (yych != 'h') goto yy126; +yy130: + yych = *++YYCURSOR; + if (yych != '>') goto yy126; + ++YYCURSOR; +#line 121 "_expander.re" + {RET(5);} +#line 583 "_expander.cc" +} +#line 124 "_expander.re" + + +gallery: + 
+#line 590 "_expander.cc" +{ + YYCTYPE yych; + yych = *YYCURSOR; + if (yych <= 0x0000) goto yy138; + if (yych != '<') goto yy137; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '/') goto yy140; +yy136: +#line 129 "_expander.re" + {goto gallery;} +#line 601 "_expander.cc" +yy137: + yych = *++YYCURSOR; + goto yy136; +yy138: + ++YYCURSOR; +#line 130 "_expander.re" + {RET(0);} +#line 609 "_expander.cc" +yy140: + yych = *++YYCURSOR; + if (yych == 'G') goto yy142; + if (yych == 'g') goto yy142; +yy141: + YYCURSOR = YYMARKER; + goto yy136; +yy142: + yych = *++YYCURSOR; + if (yych == 'A') goto yy143; + if (yych != 'a') goto yy141; +yy143: + yych = *++YYCURSOR; + if (yych == 'L') goto yy144; + if (yych != 'l') goto yy141; +yy144: + yych = *++YYCURSOR; + if (yych == 'L') goto yy145; + if (yych != 'l') goto yy141; +yy145: + yych = *++YYCURSOR; + if (yych == 'E') goto yy146; + if (yych != 'e') goto yy141; +yy146: + yych = *++YYCURSOR; + if (yych == 'R') goto yy147; + if (yych != 'r') goto yy141; +yy147: + yych = *++YYCURSOR; + if (yych == 'Y') goto yy148; + if (yych != 'y') goto yy141; +yy148: + yych = *++YYCURSOR; + if (yych != '>') goto yy141; + ++YYCURSOR; +#line 128 "_expander.re" + {RET(5);} +#line 647 "_expander.cc" +} +#line 131 "_expander.re" + + +imagemap: + +#line 654 "_expander.cc" +{ + YYCTYPE yych; + yych = *YYCURSOR; + if (yych <= 0x0000) goto yy156; + if (yych != '<') goto yy155; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '/') goto yy158; +yy154: +#line 136 "_expander.re" + {goto imagemap;} +#line 665 "_expander.cc" +yy155: + yych = *++YYCURSOR; + goto yy154; +yy156: + ++YYCURSOR; +#line 137 "_expander.re" + {RET(0);} +#line 673 "_expander.cc" +yy158: + yych = *++YYCURSOR; + if (yych == 'I') goto yy160; + if (yych == 'i') goto yy160; +yy159: + YYCURSOR = YYMARKER; + goto yy154; +yy160: + yych = *++YYCURSOR; + if (yych == 'M') goto yy161; + if (yych != 'm') goto yy159; +yy161: + yych = *++YYCURSOR; + if (yych == 'A') goto yy162; + if (yych != 'a') goto 
yy159; +yy162: + yych = *++YYCURSOR; + if (yych == 'G') goto yy163; + if (yych != 'g') goto yy159; +yy163: + yych = *++YYCURSOR; + if (yych == 'E') goto yy164; + if (yych != 'e') goto yy159; +yy164: + yych = *++YYCURSOR; + if (yych == 'M') goto yy165; + if (yych != 'm') goto yy159; +yy165: + yych = *++YYCURSOR; + if (yych == 'A') goto yy166; + if (yych != 'a') goto yy159; +yy166: + yych = *++YYCURSOR; + if (yych == 'P') goto yy167; + if (yych != 'p') goto yy159; +yy167: + yych = *++YYCURSOR; + if (yych != '>') goto yy159; + ++YYCURSOR; +#line 135 "_expander.re" + {RET(5);} +#line 715 "_expander.cc" +} +#line 138 "_expander.re" + + +pre: + +#line 722 "_expander.cc" +{ + YYCTYPE yych; + yych = *YYCURSOR; + if (yych <= 0x0000) goto yy175; + if (yych != '<') goto yy174; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '/') goto yy177; +yy173: +#line 143 "_expander.re" + {goto pre;} +#line 733 "_expander.cc" +yy174: + yych = *++YYCURSOR; + goto yy173; +yy175: + ++YYCURSOR; +#line 144 "_expander.re" + {RET(0);} +#line 741 "_expander.cc" +yy177: + yych = *++YYCURSOR; + if (yych == 'P') goto yy179; + if (yych == 'p') goto yy179; +yy178: + YYCURSOR = YYMARKER; + goto yy173; +yy179: + yych = *++YYCURSOR; + if (yych == 'R') goto yy180; + if (yych != 'r') goto yy178; +yy180: + yych = *++YYCURSOR; + if (yych == 'E') goto yy181; + if (yych != 'e') goto yy178; +yy181: + yych = *++YYCURSOR; + if (yych != '>') goto yy178; + ++YYCURSOR; +#line 142 "_expander.re" + {RET(5);} +#line 763 "_expander.cc" +} +#line 145 "_expander.re" + + +} + + +PyObject *py_scan(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:_expander.scan", &arg1)) { + return 0; + } + PyUnicodeObject *unistr = (PyUnicodeObject*)PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "parameter cannot be converted to unicode in _expander.scan"); + return 0; + } + + Py_UNICODE *start = unistr->str; + Py_UNICODE *end = start+unistr->length; + + + 
MacroScanner scanner (start, end); + Py_BEGIN_ALLOW_THREADS + while (scanner.scan()) { + } + Py_END_ALLOW_THREADS + Py_XDECREF(unistr); + + // return PyList_New(0); // uncomment to see timings for scanning + + int size = scanner.tokens.size(); + PyObject *result = PyList_New(size); + if (!result) { + return 0; + } + + for (int i=0; i<size; i++) { + Token t = scanner.tokens[i]; + PyList_SET_ITEM(result, i, Py_BuildValue("iii", t.type, t.start, t.len)); + } + + return result; +} + + + +static PyMethodDef module_functions[] = { + {"scan", (PyCFunction)py_scan, METH_VARARGS, "scan(text)"}, + {0, 0}, +}; + + + +extern "C" { + DL_EXPORT(void) init_expander(); +} + +DL_EXPORT(void) init_expander() +{ + /*PyObject *m =*/ Py_InitModule("_expander", module_functions); +} diff --git a/mwlib/_expander.re b/mwlib/_expander.re new file mode 100644 index 0000000..7abb2ac --- /dev/null +++ b/mwlib/_expander.re @@ -0,0 +1,206 @@ +// -*- mode: c++ -*- +// Copyright (c) 2007-2008 PediaPress GmbH +// See README.txt for additional licensing information. 
+ +#include <Python.h> + +#include <iostream> +#include <assert.h> +#include <vector> + +using namespace std; + +#define RET(x) {found(x); return x;} + +struct Token +{ + int type; + int start; + int len; +}; + + +class MacroScanner +{ +public: + + MacroScanner(Py_UNICODE *_start, Py_UNICODE *_end) { + source = start = _start; + end = _end; + cursor = start; + } + + int found(int val) { + if (val==5 && tokens.size()) { + Token &previous_token (tokens[tokens.size()-1]); + if (previous_token.type==val) { + previous_token.len += cursor-start; + return tokens.size()-1; + } + } + Token t; + t.type = val; + t.start = (start-source); + t.len = cursor-start; + tokens.push_back(t); + return tokens.size()-1; + } + + inline int scan(); + + Py_UNICODE *source; + + Py_UNICODE *start; + Py_UNICODE *cursor; + Py_UNICODE *end; + vector<Token> tokens; +}; + + +int MacroScanner::scan() +{ + +std: + + start=cursor; + + Py_UNICODE *marker=cursor; + + Py_UNICODE *save_cursor = cursor; + + +#define YYCTYPE Py_UNICODE +#define YYCURSOR cursor +#define YYMARKER marker +#define YYLIMIT (end) +// #define YYFILL(n) return 0; + +/*!re2c +re2c:yyfill:enable = 0 ; +*/ + + + +/*!re2c + "{"{2,} {RET(1);} + "}"{2,} {RET(2);} + "[[" | "]]" {RET(3);} + "|" {RET(6);} + + '<noinclude>' {goto noinclude;} + '<nowiki>' {goto nowiki;} + '<imagemap' [^<>\000]* '>' {goto imagemap;} + '<math' [^<>\000]* '>' {goto math;} + '<gallery' [^<>\000]* '>' {goto gallery;} + + "<!--[^\000<>]*-->" {RET(5);} + + "\000" {RET(0);} + [^\000] {RET(5);} + + */ + + + +noinclude: +/*!re2c + '</noinclude>' {goto std;} + [^\000] {goto noinclude;} + "\000" {cursor=start+11; RET(5);} + */ + +nowiki: +/*!re2c + '</nowiki>' {RET(5);} + [^\000] {goto nowiki;} + "\000" {RET(0);} + */ + +math: +/*!re2c + '</math>' {RET(5);} + [^\000] {goto math;} + "\000" {RET(0);} + */ + +gallery: +/*!re2c + '</gallery>' {RET(5);} + [^\000] {goto gallery;} + "\000" {RET(0);} + */ + +imagemap: +/*!re2c + '</imagemap>' {RET(5);} + [^\000] {goto 
imagemap;} + "\000" {RET(0);} + */ + +pre: +/*!re2c + '</pre>' {RET(5);} + [^\000] {goto pre;} + "\000" {RET(0);} + */ + +} + + +PyObject *py_scan(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:_expander.scan", &arg1)) { + return 0; + } + PyUnicodeObject *unistr = (PyUnicodeObject*)PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "parameter cannot be converted to unicode in _expander.scan"); + return 0; + } + + Py_UNICODE *start = unistr->str; + Py_UNICODE *end = start+unistr->length; + + + MacroScanner scanner (start, end); + Py_BEGIN_ALLOW_THREADS + while (scanner.scan()) { + } + Py_END_ALLOW_THREADS + Py_XDECREF(unistr); + + // return PyList_New(0); // uncomment to see timings for scanning + + int size = scanner.tokens.size(); + PyObject *result = PyList_New(size); + if (!result) { + return 0; + } + + for (int i=0; i<size; i++) { + Token t = scanner.tokens[i]; + PyList_SET_ITEM(result, i, Py_BuildValue("iii", t.type, t.start, t.len)); + } + + return result; +} + + + +static PyMethodDef module_functions[] = { + {"scan", (PyCFunction)py_scan, METH_VARARGS, "scan(text)"}, + {0, 0}, +}; + + + +extern "C" { + DL_EXPORT(void) init_expander(); +} + +DL_EXPORT(void) init_expander() +{ + /*PyObject *m =*/ Py_InitModule("_expander", module_functions); +} diff --git a/mwlib/_expander.so b/mwlib/_expander.so Binary files differnew file mode 100755 index 0000000..be09917 --- /dev/null +++ b/mwlib/_expander.so diff --git a/mwlib/_mwscan.cc b/mwlib/_mwscan.cc new file mode 100644 index 0000000..f673880 --- /dev/null +++ b/mwlib/_mwscan.cc @@ -0,0 +1,1699 @@ +/* Generated by re2c 0.13.4 */ +#line 1 "_mwscan.re" +// -*- mode: c++ -*- +// Copyright (c) 2007-2008 PediaPress GmbH +// See README.txt for additional licensing information. 
+ +#include <Python.h> + +#include <iostream> +#include <assert.h> +#include <vector> +using namespace std; + +#define RET(x) {found(x); return x;} + +typedef enum { + t_end, + t_text, + t_entity, + t_special, + t_magicword, + t_comment, + t_2box_open, // [[ + t_2box_close, // ]] + t_http_url, + t_break, + t_begin_table, + t_end_table, + t_html_tag, + t_style, + t_pre, + t_section, + t_section_end, + t_item, + t_colon, + t_semicolon, + t_hrule, + t_newline, + t_column, + t_row, + t_tablecaption, + t_urllink, +} mwtok; + +struct Token +{ + int type; + int start; + int len; +}; + +class Scanner +{ +public: + + Scanner(Py_UNICODE *_start, Py_UNICODE *_end) { + source = start = _start; + end = _end; + cursor = start; + line_startswith_section = -1; + tablemode=0; + } + + int found(mwtok val) { + if (val==t_text && tokens.size()) { + Token &previous_token (tokens[tokens.size()-1]); + if (previous_token.type==val) { + previous_token.len += cursor-start; + return tokens.size()-1; + } + } + Token t; + t.type = val; + t.start = (start-source); + t.len = cursor-start; + tokens.push_back(t); + return tokens.size()-1; + } + + bool bol() const { + return (start==source) || (start[-1]=='\n'); + } + + bool eol() const { + return *cursor=='\n' || *cursor==0; + } + + void newline() { + if (line_startswith_section>=0) { + tokens[line_startswith_section].type = t_text; + } + line_startswith_section = -1; + } + + inline int scan(); + + Py_UNICODE *source; + + Py_UNICODE *start; + Py_UNICODE *cursor; + Py_UNICODE *end; + vector<Token> tokens; + + int line_startswith_section; + int tablemode; +}; + + +int Scanner::scan() +{ + start=cursor; + + Py_UNICODE *marker=cursor; + + Py_UNICODE *save_cursor = cursor; + + +#define YYCTYPE Py_UNICODE +#define YYCURSOR cursor +#define YYMARKER marker +#define YYLIMIT (end) +// #define YYFILL(n) return 0; + +#line 124 "_mwscan.re" + + +/* + the re2c manpage says: + "The user must arrange for a sentinel token to appear at the end of input" + \000 is 
our sentinel token. +*/ + +#line 157 "_mwscan.re" + + if (!bol()) { + goto not_bol; + } + +#line 140 "_mwscan.cc" +{ + YYCTYPE yych; + unsigned int yyaccept = 0; + + yych = *YYCURSOR; + if (yych <= '-') { + if (yych <= '"') { + if (yych <= 0x001F) goto yy18; + if (yych <= ' ') goto yy2; + if (yych <= '!') goto yy8; + goto yy18; + } else { + if (yych <= ')') { + if (yych <= '#') goto yy13; + goto yy18; + } else { + if (yych <= '*') goto yy13; + if (yych <= ',') goto yy18; + goto yy17; + } + } + } else { + if (yych <= '<') { + if (yych <= '9') goto yy18; + if (yych <= ':') goto yy11; + if (yych <= ';') goto yy15; + goto yy18; + } else { + if (yych <= 'z') { + if (yych <= '=') goto yy9; + goto yy18; + } else { + if (yych <= '{') goto yy4; + if (yych <= '|') goto yy6; + goto yy18; + } + } + } +yy2: + yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= 0x001F) goto yy3; + if (yych <= '!') goto yy47; + if (yych <= 'z') goto yy3; + if (yych <= '|') goto yy47; +yy3: +#line 199 "_mwscan.re" + {RET(t_pre);} +#line 189 "_mwscan.cc" +yy4: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '|') goto yy43; +yy5: +#line 209 "_mwscan.re" + {goto not_bol;} +#line 196 "_mwscan.cc" +yy6: + ++YYCURSOR; + if ((yych = *YYCURSOR) <= ',') { + if (yych == '+') goto yy35; + } else { + if (yych <= '-') goto yy38; + if (yych == '}') goto yy41; + } +yy7: +#line 177 "_mwscan.re" + { + if (tablemode) + RET(t_column); + + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } +#line 217 "_mwscan.cc" +yy8: + yych = *++YYCURSOR; + goto yy7; +yy9: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '=') goto yy33; + goto yy32; +yy10: +#line 200 "_mwscan.re" + { + line_startswith_section = found(t_section); + return t_section; + } +#line 231 "_mwscan.cc" +yy11: + ++YYCURSOR; + if ((yych = *YYCURSOR) <= ')') { + if (yych == '#') goto yy27; + } else { + if (yych <= '*') goto yy27; + if (yych == ':') goto yy29; + } +yy12: +#line 205 "_mwscan.re" + {RET(t_colon);} +#line 243 "_mwscan.cc" 
+yy13: + ++YYCURSOR; + yych = *YYCURSOR; + goto yy28; +yy14: +#line 204 "_mwscan.re" + {RET(t_item);} +#line 251 "_mwscan.cc" +yy15: + ++YYCURSOR; + yych = *YYCURSOR; + goto yy26; +yy16: +#line 206 "_mwscan.re" + {RET(t_semicolon);} +#line 259 "_mwscan.cc" +yy17: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == '-') goto yy19; + goto yy5; +yy18: + yych = *++YYCURSOR; + goto yy5; +yy19: + yych = *++YYCURSOR; + if (yych == '-') goto yy21; +yy20: + YYCURSOR = YYMARKER; + if (yyaccept <= 0) { + goto yy3; + } else { + goto yy5; + } +yy21: + yych = *++YYCURSOR; + if (yych != '-') goto yy20; +yy22: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '-') goto yy22; +#line 207 "_mwscan.re" + {RET(t_hrule);} +#line 287 "_mwscan.cc" +yy25: + ++YYCURSOR; + yych = *YYCURSOR; +yy26: + if (yych == ';') goto yy25; + goto yy16; +yy27: + ++YYCURSOR; + yych = *YYCURSOR; +yy28: + if (yych == '#') goto yy27; + if (yych == '*') goto yy27; + goto yy14; +yy29: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ')') { + if (yych == '#') goto yy27; + goto yy12; + } else { + if (yych <= '*') goto yy27; + if (yych == ':') goto yy29; + goto yy12; + } +yy31: + ++YYCURSOR; + yych = *YYCURSOR; +yy32: + if (yych == '\t') goto yy31; + if (yych == ' ') goto yy31; + goto yy10; +yy33: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= 0x001F) { + if (yych == '\t') goto yy31; + goto yy10; + } else { + if (yych <= ' ') goto yy31; + if (yych == '=') goto yy33; + goto yy10; + } +yy35: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '+') goto yy35; +#line 189 "_mwscan.re" + { + if (tablemode) + RET(t_tablecaption); + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } +#line 344 "_mwscan.cc" +yy38: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '-') goto yy38; +#line 166 "_mwscan.re" + { + if (tablemode) + RET(t_row); + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } +#line 359 "_mwscan.cc" +yy41: + ++YYCURSOR; +#line 163 "_mwscan.re" + 
{--tablemode; RET(t_end_table);} +#line 364 "_mwscan.cc" +yy43: + ++YYCURSOR; +#line 162 "_mwscan.re" + {++tablemode; RET(t_begin_table);} +#line 369 "_mwscan.cc" +yy45: + yych = *++YYCURSOR; + if (yych <= ',') { + if (yych == '+') goto yy35; + goto yy7; + } else { + if (yych <= '-') goto yy38; + if (yych == '}') goto yy41; + goto yy7; + } +yy46: + ++YYCURSOR; + yych = *YYCURSOR; +yy47: + if (yych <= '!') { + if (yych <= 0x001F) goto yy20; + if (yych <= ' ') goto yy46; + } else { + if (yych <= 'z') goto yy20; + if (yych <= '{') goto yy49; + if (yych <= '|') goto yy45; + goto yy20; + } + yych = *++YYCURSOR; + goto yy7; +yy49: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '|') goto yy43; + goto yy20; +} +#line 210 "_mwscan.re" + + + +not_bol: + cursor = save_cursor; + marker = cursor; + + +#line 409 "_mwscan.cc" +{ + YYCTYPE yych; + unsigned int yyaccept = 0; + yych = *YYCURSOR; + if (yych <= 'Z') { + if (yych <= '\'') { + if (yych <= ' ') { + if (yych <= 0x0000) goto yy72; + if (yych == '\n') goto yy63; + goto yy74; + } else { + if (yych <= '!') goto yy66; + if (yych <= '%') goto yy74; + if (yych <= '&') goto yy71; + goto yy69; + } + } else { + if (yych <= ';') { + if (yych <= '/') goto yy74; + if (yych <= '9') goto yy59; + if (yych <= ':') goto yy68; + goto yy74; + } else { + if (yych <= '<') goto yy70; + if (yych <= '=') goto yy61; + if (yych <= '@') goto yy74; + goto yy59; + } + } + } else { + if (yych <= 'f') { + if (yych <= '^') { + if (yych <= '[') goto yy52; + if (yych == ']') goto yy60; + goto yy74; + } else { + if (yych <= '_') goto yy58; + if (yych <= '`') goto yy74; + if (yych <= 'e') goto yy59; + goto yy56; + } + } else { + if (yych <= 'm') { + if (yych == 'h') goto yy57; + if (yych <= 'l') goto yy59; + goto yy54; + } else { + if (yych <= 'z') goto yy59; + if (yych == '|') goto yy65; + goto yy74; + } + } + } +yy52: + yyaccept = 0; + yych = *(YYMARKER = ++YYCURSOR); + switch (yych) { + case '[': goto yy249; + case 'f': goto yy252; + case 'h': goto yy251; + 
case 'm': goto yy253; + default: goto yy53; + } +yy53: +#line 256 "_mwscan.re" + {RET(t_special);} +#line 476 "_mwscan.cc" +yy54: + ++YYCURSOR; + if ((yych = *YYCURSOR) == 'a') goto yy237; + goto yy121; +yy55: +#line 225 "_mwscan.re" + {RET(t_text);} +#line 484 "_mwscan.cc" +yy56: + yych = *++YYCURSOR; + if (yych == 't') goto yy229; + goto yy121; +yy57: + yych = *++YYCURSOR; + if (yych == 't') goto yy219; + goto yy121; +yy58: + yych = *++YYCURSOR; + if (yych == '_') goto yy122; + goto yy121; +yy59: + yych = *++YYCURSOR; + goto yy121; +yy60: + yych = *++YYCURSOR; + if (yych == ']') goto yy118; + goto yy53; +yy61: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '=') goto yy116; + goto yy115; +yy62: +#line 228 "_mwscan.re" + { + if (eol()) { + if (line_startswith_section>=0) { + line_startswith_section=-1; + RET(t_section_end); + } else { + RET(t_text); + } + } else { + RET(t_text); + } + } +#line 522 "_mwscan.cc" +yy63: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '\n') goto yy111; +#line 241 "_mwscan.re" + {newline(); RET(t_newline);} +#line 528 "_mwscan.cc" +yy65: + yych = *++YYCURSOR; + if (yych <= '*') { + if (yych == '!') goto yy107; + goto yy53; + } else { + if (yych <= '+') goto yy109; + if (yych == '|') goto yy107; + goto yy53; + } +yy66: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '!') goto yy107; +yy67: +#line 266 "_mwscan.re" + {RET(t_text);} +#line 545 "_mwscan.cc" +yy68: + yych = *++YYCURSOR; + goto yy53; +yy69: + yych = *++YYCURSOR; + if (yych == '\'') goto yy102; + goto yy67; +yy70: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= '/') { + if (yych == '!') goto yy86; + if (yych <= '.') goto yy67; + goto yy87; + } else { + if (yych <= 'Z') { + if (yych <= '@') goto yy67; + goto yy88; + } else { + if (yych <= '`') goto yy67; + if (yych <= 'z') goto yy88; + goto yy67; + } + } +yy71: + yyaccept = 1; + yych = *(YYMARKER = ++YYCURSOR); + if (yych <= '9') { + if (yych == '#') goto yy75; + if (yych <= '/') goto yy67; + goto yy77; + } else { + if (yych <= 
'Z') { + if (yych <= '@') goto yy67; + goto yy77; + } else { + if (yych <= '`') goto yy67; + if (yych <= 'z') goto yy77; + goto yy67; + } + } +yy72: + ++YYCURSOR; +#line 265 "_mwscan.re" + {newline(); return t_end;} +#line 591 "_mwscan.cc" +yy74: + yych = *++YYCURSOR; + goto yy67; +yy75: + yych = *++YYCURSOR; + if (yych <= 'W') { + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy82; + } else { + if (yych <= 'X') goto yy81; + if (yych == 'x') goto yy81; + } +yy76: + YYCURSOR = YYMARKER; + if (yyaccept <= 1) { + if (yyaccept <= 0) { + goto yy53; + } else { + goto yy67; + } + } else { + if (yyaccept <= 2) { + goto yy103; + } else { + goto yy55; + } + } +yy77: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy77; + if (yych <= ':') goto yy76; + } else { + if (yych <= 'Z') { + if (yych <= '@') goto yy76; + goto yy77; + } else { + if (yych <= '`') goto yy76; + if (yych <= 'z') goto yy77; + goto yy76; + } + } +yy79: + ++YYCURSOR; +#line 263 "_mwscan.re" + {RET(t_entity);} +#line 640 "_mwscan.cc" +yy81: + yych = *++YYCURSOR; + if (yych == ';') goto yy76; + goto yy85; +yy82: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy82; + if (yych == ';') goto yy79; + goto yy76; +yy84: + ++YYCURSOR; + yych = *YYCURSOR; +yy85: + if (yych <= ';') { + if (yych <= '/') goto yy76; + if (yych <= '9') goto yy84; + if (yych <= ':') goto yy76; + goto yy79; + } else { + if (yych <= 'F') { + if (yych <= '@') goto yy76; + goto yy84; + } else { + if (yych <= '`') goto yy76; + if (yych <= 'f') goto yy84; + goto yy76; + } + } +yy86: + yych = *++YYCURSOR; + if (yych == '-') goto yy94; + goto yy76; +yy87: + yych = *++YYCURSOR; + if (yych <= '@') goto yy76; + if (yych <= 'Z') goto yy88; + if (yych <= '`') goto yy76; + if (yych >= '{') goto yy76; +yy88: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '>') { + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + } else { + if (yych <= '<') goto 
yy76; + if (yych >= '>') goto yy92; + } + } else { + if (yych <= 'Z') { + if (yych >= 'A') goto yy88; + } else { + if (yych <= '`') goto yy90; + if (yych <= 'z') goto yy88; + } + } +yy90: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '<') { + if (yych <= 0x0000) goto yy76; + if (yych <= ';') goto yy90; + goto yy76; + } else { + if (yych != '>') goto yy90; + } +yy92: + ++YYCURSOR; +#line 259 "_mwscan.re" + {RET(t_html_tag);} +#line 713 "_mwscan.cc" +yy94: + yych = *++YYCURSOR; + if (yych != '-') goto yy76; +yy95: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + if (yych != '-') goto yy95; + } else { + if (yych == '=') goto yy95; + if (yych <= '>') goto yy76; + goto yy95; + } + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + if (yych != '-') goto yy95; + } else { + if (yych == '=') goto yy95; + if (yych <= '>') goto yy76; + goto yy95; + } +yy98: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= ';') { + if (yych <= 0x0000) goto yy76; + if (yych == '-') goto yy98; + goto yy95; + } else { + if (yych <= '<') goto yy76; + if (yych != '>') goto yy95; + } + ++YYCURSOR; +#line 262 "_mwscan.re" + {RET(t_comment);} +#line 752 "_mwscan.cc" +yy102: + ++YYCURSOR; + if ((yych = *YYCURSOR) == '\'') goto yy104; +yy103: +#line 257 "_mwscan.re" + {RET(t_style);} +#line 759 "_mwscan.cc" +yy104: + yyaccept = 2; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != '\'') goto yy103; + yych = *++YYCURSOR; + if (yych != '\'') goto yy76; + yych = *++YYCURSOR; + goto yy103; +yy107: + ++YYCURSOR; +#line 243 "_mwscan.re" + { + if (tablemode) + RET(t_column); + cursor = start+1; + RET(t_special); + } +#line 777 "_mwscan.cc" +yy109: + ++YYCURSOR; +#line 250 "_mwscan.re" + { + if (tablemode) + RET(t_tablecaption); + cursor = start+1; + RET(t_special); + } +#line 787 "_mwscan.cc" +yy111: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych == '\n') goto yy111; +#line 240 "_mwscan.re" + {newline(); RET(t_break);} +#line 794 
"_mwscan.cc" +yy114: + ++YYCURSOR; + yych = *YYCURSOR; +yy115: + if (yych == '\t') goto yy114; + if (yych == ' ') goto yy114; + goto yy62; +yy116: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= 0x001F) { + if (yych == '\t') goto yy114; + goto yy62; + } else { + if (yych <= ' ') goto yy114; + if (yych == '=') goto yy116; + goto yy62; + } +yy118: + ++YYCURSOR; +#line 227 "_mwscan.re" + {RET(t_2box_close);} +#line 817 "_mwscan.cc" +yy120: + ++YYCURSOR; + yych = *YYCURSOR; +yy121: + if (yych <= 'Z') { + if (yych <= '/') goto yy55; + if (yych <= '9') goto yy120; + if (yych <= '@') goto yy55; + goto yy120; + } else { + if (yych <= '_') { + if (yych <= '^') goto yy55; + goto yy120; + } else { + if (yych <= '`') goto yy55; + if (yych <= 'z') goto yy120; + goto yy55; + } + } +yy122: + yych = *++YYCURSOR; + switch (yych) { + case 'E': goto yy126; + case 'F': goto yy125; + case 'N': goto yy124; + case 'S': goto yy127; + case 'T': goto yy123; + default: goto yy121; + } +yy123: + yych = *++YYCURSOR; + if (yych == 'O') goto yy216; + goto yy121; +yy124: + yych = *++YYCURSOR; + if (yych == 'E') goto yy146; + if (yych == 'O') goto yy147; + goto yy121; +yy125: + yych = *++YYCURSOR; + if (yych == 'O') goto yy138; + goto yy121; +yy126: + yych = *++YYCURSOR; + if (yych == 'N') goto yy135; + goto yy121; +yy127: + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'A') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; +yy133: + ++YYCURSOR; + if ((yych = *YYCURSOR) <= 'Z') { + if (yych <= '/') goto yy134; + if (yych <= '9') goto yy120; + if (yych >= 'A') goto yy120; + } else { + if (yych <= '_') { + if (yych >= '_') goto yy120; + } else { + if (yych <= '`') goto yy134; + if (yych <= 'z') goto yy120; + } + } +yy134: +#line 224 "_mwscan.re" + {RET(t_magicword);} +#line 894 
"_mwscan.cc" +yy135: + yych = *++YYCURSOR; + if (yych != 'D') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy138: + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy146: + yych = *++YYCURSOR; + if (yych == 'W') goto yy203; + goto yy121; +yy147: + yych = *++YYCURSOR; + switch (yych) { + case 'C': goto yy150; + case 'E': goto yy149; + case 'G': goto yy151; + case 'T': goto yy148; + default: goto yy121; + } +yy148: + yych = *++YYCURSOR; + if (yych <= 'H') { + if (yych == 'C') goto yy186; + goto yy121; + } else { + if (yych <= 'I') goto yy187; + if (yych == 'O') goto yy188; + goto yy121; + } +yy149: + yych = *++YYCURSOR; + if (yych == 'D') goto yy175; + goto yy121; +yy150: + yych = *++YYCURSOR; + if (yych == 'C') goto yy159; + if (yych == 'O') goto yy160; + goto yy121; +yy151: + yych = *++YYCURSOR; + if (yych != 'A') goto yy121; + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'Y') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy159: + yych = *++YYCURSOR; + if (yych == '_') goto yy174; + goto yy121; +yy160: + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + 
yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'V') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy174: + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy175: + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'S') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy186: + yych = *++YYCURSOR; + if (yych == '_') goto yy202; + goto yy121; +yy187: + yych = *++YYCURSOR; + if (yych == 'T') goto yy191; + goto yy121; +yy188: + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy191: + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'V') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'R') goto yy121; + yych = 
*++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy202: + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy203: + yych = *++YYCURSOR; + if (yych != 'S') goto yy121; + yych = *++YYCURSOR; + if (yych != 'E') goto yy121; + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != 'T') goto yy121; + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'O') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'L') goto yy121; + yych = *++YYCURSOR; + if (yych != 'I') goto yy121; + yych = *++YYCURSOR; + if (yych != 'N') goto yy121; + yych = *++YYCURSOR; + if (yych != 'K') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy216: + yych = *++YYCURSOR; + if (yych != 'C') goto yy121; + yych = *++YYCURSOR; + if (yych != '_') goto yy121; + yych = *++YYCURSOR; + if (yych == '_') goto yy133; + goto yy121; +yy219: + yych = *++YYCURSOR; + if (yych != 't') goto yy121; + yych = *++YYCURSOR; + if (yych != 'p') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych == ':') goto yy223; + if (yych != 's') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != ':') goto yy121; +yy223: + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy226; + if (yych <= '$') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych == '*') goto yy76; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy226; + if (yych <= '>') goto yy76; + } else { + if (yych <= '@') goto yy76; + if (yych <= 'Z') goto yy226; + if (yych <= '^') goto yy76; + } + } + } else { + if (yych 
<= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy76; + if (yych <= 'z') goto yy226; + if (yych <= '}') goto yy76; + } else { + if (yych == 0x00C4) goto yy226; + if (yych <= 0x00D5) goto yy76; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy226; + if (yych <= 0x00E3) goto yy76; + } else { + if (yych <= 0x00F6) { + if (yych <= 0x00F5) goto yy76; + } else { + if (yych != 0x00FC) goto yy76; + } + } + } + } +yy226: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy226; + if (yych >= '%') goto yy226; + } else { + if (yych <= '\'') goto yy228; + if (yych != '*') goto yy226; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy226; + if (yych >= '?') goto yy226; + } else { + if (yych <= '@') goto yy228; + if (yych <= 'Z') goto yy226; + if (yych >= '_') goto yy226; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy228; + if (yych <= 'z') goto yy226; + if (yych >= '~') goto yy226; + } else { + if (yych == 0x00C4) goto yy226; + if (yych >= 0x00D6) goto yy226; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy226; + if (yych >= 0x00E4) goto yy226; + } else { + if (yych <= 0x00F6) { + if (yych >= 0x00F6) goto yy226; + } else { + if (yych == 0x00FC) goto yy226; + } + } + } + } +yy228: +#line 223 "_mwscan.re" + {RET(t_http_url);} +#line 1221 "_mwscan.cc" +yy229: + yych = *++YYCURSOR; + if (yych != 'p') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != ':') goto yy121; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy76; + if (yych == '%') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych <= ':') goto yy234; + if (yych <= '<') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy76; + if (yych <= 'Z') goto 
yy234; + if (yych <= '^') goto yy76; + } else { + if (yych <= '{') { + if (yych <= '`') goto yy76; + } else { + if (yych <= '|') goto yy76; + if (yych >= 0x007F) goto yy76; + } + } + } +yy234: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy236; + if (yych != '%') goto yy234; + } else { + if (yych <= '\'') goto yy236; + if (yych <= ':') goto yy234; + if (yych >= '=') goto yy234; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy236; + if (yych <= 'Z') goto yy234; + if (yych >= '_') goto yy234; + } else { + if (yych <= '{') { + if (yych >= 'a') goto yy234; + } else { + if (yych <= '|') goto yy236; + if (yych <= '~') goto yy234; + } + } + } +yy236: +#line 221 "_mwscan.re" + {RET(t_http_url);} +#line 1285 "_mwscan.cc" +yy237: + yych = *++YYCURSOR; + if (yych != 'i') goto yy121; + yych = *++YYCURSOR; + if (yych != 'l') goto yy121; + yych = *++YYCURSOR; + if (yych != 't') goto yy121; + yych = *++YYCURSOR; + if (yych != 'o') goto yy121; + yyaccept = 3; + yych = *(YYMARKER = ++YYCURSOR); + if (yych != ':') goto yy121; + yych = *++YYCURSOR; + if (yych == '@') goto yy76; + goto yy244; +yy243: + ++YYCURSOR; + yych = *YYCURSOR; +yy244: + if (yych <= '9') { + if (yych <= '\'') { + if (yych == '!') goto yy243; + if (yych <= '"') goto yy76; + goto yy243; + } else { + if (yych <= ')') goto yy76; + if (yych == ',') goto yy76; + goto yy243; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy243; + if (yych <= '>') goto yy76; + goto yy243; + } else { + if (yych <= 'Z') { + if (yych >= 'A') goto yy243; + } else { + if (yych <= ']') goto yy76; + if (yych <= '~') goto yy243; + goto yy76; + } + } + } + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych <= ',') goto yy76; + } else { + if (yych <= '/') goto yy76; + if (yych >= ':') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy246; + if (yych <= '^') goto yy76; + } else { + if (yych <= '`') goto yy76; + if (yych >= 
'{') goto yy76; + } + } +yy246: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych >= '-') goto yy246; + } else { + if (yych <= '/') goto yy248; + if (yych <= '9') goto yy246; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy246; + if (yych >= '_') goto yy246; + } else { + if (yych <= '`') goto yy248; + if (yych <= 'z') goto yy246; + } + } +yy248: +#line 219 "_mwscan.re" + {RET(t_http_url);} +#line 1369 "_mwscan.cc" +yy249: + ++YYCURSOR; +#line 226 "_mwscan.re" + {RET(t_2box_open);} +#line 1374 "_mwscan.cc" +yy251: + yych = *++YYCURSOR; + if (yych == 't') goto yy274; + goto yy76; +yy252: + yych = *++YYCURSOR; + if (yych == 't') goto yy266; + goto yy76; +yy253: + yych = *++YYCURSOR; + if (yych != 'a') goto yy76; + yych = *++YYCURSOR; + if (yych != 'i') goto yy76; + yych = *++YYCURSOR; + if (yych != 'l') goto yy76; + yych = *++YYCURSOR; + if (yych != 't') goto yy76; + yych = *++YYCURSOR; + if (yych != 'o') goto yy76; + yych = *++YYCURSOR; + if (yych != ':') goto yy76; + yych = *++YYCURSOR; + if (yych == '@') goto yy76; + goto yy261; +yy260: + ++YYCURSOR; + yych = *YYCURSOR; +yy261: + if (yych <= '9') { + if (yych <= '\'') { + if (yych == '!') goto yy260; + if (yych <= '"') goto yy76; + goto yy260; + } else { + if (yych <= ')') goto yy76; + if (yych == ',') goto yy76; + goto yy260; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy260; + if (yych <= '>') goto yy76; + goto yy260; + } else { + if (yych <= 'Z') { + if (yych >= 'A') goto yy260; + } else { + if (yych <= ']') goto yy76; + if (yych <= '~') goto yy260; + goto yy76; + } + } + } + yych = *++YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych <= ',') goto yy76; + } else { + if (yych <= '/') goto yy76; + if (yych >= ':') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy263; + if (yych <= '^') goto yy76; + } else { + if (yych <= '`') goto yy76; + if (yych >= '{') goto yy76; + } + } +yy263: + ++YYCURSOR; + yych = 
*YYCURSOR; + if (yych <= '@') { + if (yych <= '.') { + if (yych >= '-') goto yy263; + } else { + if (yych <= '/') goto yy265; + if (yych <= '9') goto yy263; + } + } else { + if (yych <= '_') { + if (yych <= 'Z') goto yy263; + if (yych >= '_') goto yy263; + } else { + if (yych <= '`') goto yy265; + if (yych <= 'z') goto yy263; + } + } +yy265: +#line 218 "_mwscan.re" + {RET(t_urllink);} +#line 1467 "_mwscan.cc" +yy266: + yych = *++YYCURSOR; + if (yych != 'p') goto yy76; + yych = *++YYCURSOR; + if (yych != ':') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy76; + if (yych == '%') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych <= ':') goto yy271; + if (yych <= '<') goto yy76; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy76; + if (yych <= 'Z') goto yy271; + if (yych <= '^') goto yy76; + } else { + if (yych <= '{') { + if (yych <= '`') goto yy76; + } else { + if (yych <= '|') goto yy76; + if (yych >= 0x007F) goto yy76; + } + } + } +yy271: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '=') { + if (yych <= '&') { + if (yych <= '"') goto yy273; + if (yych != '%') goto yy271; + } else { + if (yych <= '\'') goto yy273; + if (yych <= ':') goto yy271; + if (yych >= '=') goto yy271; + } + } else { + if (yych <= '_') { + if (yych <= '>') goto yy273; + if (yych <= 'Z') goto yy271; + if (yych >= '_') goto yy271; + } else { + if (yych <= '{') { + if (yych >= 'a') goto yy271; + } else { + if (yych <= '|') goto yy273; + if (yych <= '~') goto yy271; + } + } + } +yy273: +#line 220 "_mwscan.re" + {RET(t_urllink);} +#line 1530 "_mwscan.cc" +yy274: + yych = *++YYCURSOR; + if (yych != 't') goto yy76; + yych = *++YYCURSOR; + if (yych != 'p') goto yy76; + yych = *++YYCURSOR; + if (yych == ':') goto yy278; + if (yych != 's') goto yy76; + yych = *++YYCURSOR; + if (yych != ':') goto yy76; +yy278: + yych = 
*++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych != '/') goto yy76; + yych = *++YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy281; + if (yych <= '$') goto yy76; + } else { + if (yych <= '\'') goto yy76; + if (yych == '*') goto yy76; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy281; + if (yych <= '>') goto yy76; + } else { + if (yych <= '@') goto yy76; + if (yych <= 'Z') goto yy281; + if (yych <= '^') goto yy76; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy76; + if (yych <= 'z') goto yy281; + if (yych <= '}') goto yy76; + } else { + if (yych == 0x00C4) goto yy281; + if (yych <= 0x00D5) goto yy76; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy281; + if (yych <= 0x00E3) goto yy76; + } else { + if (yych <= 0x00F6) { + if (yych <= 0x00F5) goto yy76; + } else { + if (yych != 0x00FC) goto yy76; + } + } + } + } +yy281: + ++YYCURSOR; + yych = *YYCURSOR; + if (yych <= '_') { + if (yych <= ':') { + if (yych <= '&') { + if (yych == '#') goto yy281; + if (yych >= '%') goto yy281; + } else { + if (yych <= '\'') goto yy283; + if (yych != '*') goto yy281; + } + } else { + if (yych <= '?') { + if (yych == '=') goto yy281; + if (yych >= '?') goto yy281; + } else { + if (yych <= '@') goto yy283; + if (yych <= 'Z') goto yy281; + if (yych >= '_') goto yy281; + } + } + } else { + if (yych <= 0x00D6) { + if (yych <= '~') { + if (yych <= '`') goto yy283; + if (yych <= 'z') goto yy281; + if (yych >= '~') goto yy281; + } else { + if (yych == 0x00C4) goto yy281; + if (yych >= 0x00D6) goto yy281; + } + } else { + if (yych <= 0x00E4) { + if (yych == 0x00DC) goto yy281; + if (yych >= 0x00E4) goto yy281; + } else { + if (yych <= 0x00F6) { + if (yych >= 0x00F6) goto yy281; + } else { + if (yych == 0x00FC) goto yy281; + } + } + } + } +yy283: +#line 222 "_mwscan.re" + {RET(t_urllink);} +#line 1637 "_mwscan.cc" +} +#line 267 "_mwscan.re" 
+ +} + + +PyObject *py_scan(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:mwscan.scan", &arg1)) { + return 0; + } + PyUnicodeObject *unistr = (PyUnicodeObject*)PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "parameter cannot be converted to unicode in mwscan.scan"); + return 0; + } + + Py_UNICODE *start = unistr->str; + Py_UNICODE *end = start+unistr->length; + + + Scanner scanner (start, end); + Py_BEGIN_ALLOW_THREADS + while (scanner.scan()) { + } + Py_END_ALLOW_THREADS + Py_XDECREF(unistr); + + // return PyList_New(0); // uncomment to see timings for scanning + + int size = scanner.tokens.size(); + PyObject *result = PyList_New(size); + if (!result) { + return 0; + } + + for (int i=0; i<size; i++) { + Token t = scanner.tokens[i]; + PyList_SET_ITEM(result, i, Py_BuildValue("iii", t.type, t.start, t.len)); + } + + return result; +} + + + +static PyMethodDef module_functions[] = { + {"scan", (PyCFunction)py_scan, METH_VARARGS, "scan(text)"}, + {0, 0}, +}; + + + +extern "C" { + DL_EXPORT(void) init_mwscan(); +} + +DL_EXPORT(void) init_mwscan() +{ + /*PyObject *m =*/ Py_InitModule("_mwscan", module_functions); +} diff --git a/mwlib/_mwscan.re b/mwlib/_mwscan.re new file mode 100644 index 0000000..99ffe95 --- /dev/null +++ b/mwlib/_mwscan.re @@ -0,0 +1,327 @@ +// -*- mode: c++ -*- +// Copyright (c) 2007-2008 PediaPress GmbH +// See README.txt for additional licensing information. 
+ +#include <Python.h> + +#include <iostream> +#include <assert.h> +#include <vector> +using namespace std; + +#define RET(x) {found(x); return x;} + +typedef enum { + t_end, + t_text, + t_entity, + t_special, + t_magicword, + t_comment, + t_2box_open, // [[ + t_2box_close, // ]] + t_http_url, + t_break, + t_begin_table, + t_end_table, + t_html_tag, + t_style, + t_pre, + t_section, + t_section_end, + t_item, + t_colon, + t_semicolon, + t_hrule, + t_newline, + t_column, + t_row, + t_tablecaption, + t_urllink, +} mwtok; + +struct Token +{ + int type; + int start; + int len; +}; + +class Scanner +{ +public: + + Scanner(Py_UNICODE *_start, Py_UNICODE *_end) { + source = start = _start; + end = _end; + cursor = start; + line_startswith_section = -1; + tablemode=0; + } + + int found(mwtok val) { + if (val==t_text && tokens.size()) { + Token &previous_token (tokens[tokens.size()-1]); + if (previous_token.type==val) { + previous_token.len += cursor-start; + return tokens.size()-1; + } + } + Token t; + t.type = val; + t.start = (start-source); + t.len = cursor-start; + tokens.push_back(t); + return tokens.size()-1; + } + + bool bol() const { + return (start==source) || (start[-1]=='\n'); + } + + bool eol() const { + return *cursor=='\n' || *cursor==0; + } + + void newline() { + if (line_startswith_section>=0) { + tokens[line_startswith_section].type = t_text; + } + line_startswith_section = -1; + } + + inline int scan(); + + Py_UNICODE *source; + + Py_UNICODE *start; + Py_UNICODE *cursor; + Py_UNICODE *end; + vector<Token> tokens; + + int line_startswith_section; + int tablemode; +}; + + +int Scanner::scan() +{ + start=cursor; + + Py_UNICODE *marker=cursor; + + Py_UNICODE *save_cursor = cursor; + + +#define YYCTYPE Py_UNICODE +#define YYCURSOR cursor +#define YYMARKER marker +#define YYLIMIT (end) +// #define YYFILL(n) return 0; + +/*!re2c +re2c:yyfill:enable = 0 ; +*/ + +/* + the re2c manpage says: + "The user must arrange for a sentinel token to appear at the end of 
input" + \000 is our sentinel token. +*/ + +/*!re2c + any = [^\000]; + ftp = "ftp://" [-a-zA-Z0-9_+${}~?=/@#&*(),:.]+ ; + mailto = "mailto:" [-a-zA-Z0-9_!#$%*./?|^{}`~&'+=]+ "@" [-a-zA-Z0-9_.]+ ; + url = "http" "s"? "://" [-\xe4\xc4\xf6\xd6\xfc\xdca-zA-Z_0-9./?=&:%:~()#+,]+ ; + entity_name = "&" [a-zA-Z0-9]+ ";"; + entity_hex = "&#" 'x' [a-fA-F0-9]+ ";"; + entity_dec = "&#" [0-9]+ ";"; + + entity = (entity_name | entity_hex | entity_dec); + + + magicword = ( "__TOC__" + | "__NOTOC__" + | "__FORCETOC__" + | "__NOEDITSECTION__" + | "__NEWSECTIONLINK__" + | "__NOCONTENTCONVERT__" + | "__NOCC__" + | "__NOGALLERY__" + | "__NOTITLECONVERT__" + | "__NOTC__" + | "__END__" + | "__START__" + ); +*/ + if (!bol()) { + goto not_bol; + } +/*!re2c + " "* "{|" {++tablemode; RET(t_begin_table);} + " "* "|}" {--tablemode; RET(t_end_table);} + + " "* "|" "-"+ + { + if (tablemode) + RET(t_row); + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } + + " "* ("|" | "!") + { + if (tablemode) + RET(t_column); + + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } + + " "* "|" "+"+ + { + if (tablemode) + RET(t_tablecaption); + if (*start==' ') { + cursor = start+1; + RET(t_pre); + } + RET(t_text); + } + + " " {RET(t_pre);} + "="+ [ \t]* { + line_startswith_section = found(t_section); + return t_section; + } + ":"* [#*]+ {RET(t_item);} + ":"+ {RET(t_colon);} + ";"+ {RET(t_semicolon);} + "-"{4,} {RET(t_hrule);} + + [^] {goto not_bol;} + */ + + +not_bol: + cursor = save_cursor; + marker = cursor; + +/*!re2c + "[" mailto {RET(t_urllink);} + mailto {RET(t_http_url);} + "[" ftp {RET(t_urllink);} + ftp {RET(t_http_url);} + "[" url {RET(t_urllink);} + url {RET(t_http_url);} + magicword {RET(t_magicword);} + [a-zA-Z0-9_]+ {RET(t_text);} + "[[" {RET(t_2box_open);} + "]]" {RET(t_2box_close);} + "="+ [ \t]* { + if (eol()) { + if (line_startswith_section>=0) { + line_startswith_section=-1; + RET(t_section_end); + } else { + RET(t_text); + } + } else { + 
RET(t_text); + } + } + "\n"{2,} {newline(); RET(t_break);} + "\n" {newline(); RET(t_newline);} + "||" | "|!" | "!!" + { + if (tablemode) + RET(t_column); + cursor = start+1; + RET(t_special); + } + "|+" + { + if (tablemode) + RET(t_tablecaption); + cursor = start+1; + RET(t_special); + } + [:|\[\]] {RET(t_special);} + "'''''" | "'''" | "''" {RET(t_style);} + "<" "/"? [a-zA-Z]+ [^\000<>]* "/"? ">" + {RET(t_html_tag);} + + "<!--"[^\000<>]*"-->" + {RET(t_comment);} + entity {RET(t_entity);} + + "\000" {newline(); return t_end;} + . {RET(t_text);} +*/ +} + + +PyObject *py_scan(PyObject *self, PyObject *args) +{ + PyObject *arg1; + if (!PyArg_ParseTuple(args, "O:mwscan.scan", &arg1)) { + return 0; + } + PyUnicodeObject *unistr = (PyUnicodeObject*)PyUnicode_FromObject(arg1); + if (unistr == NULL) { + PyErr_SetString(PyExc_TypeError, + "parameter cannot be converted to unicode in mwscan.scan"); + return 0; + } + + Py_UNICODE *start = unistr->str; + Py_UNICODE *end = start+unistr->length; + + + Scanner scanner (start, end); + Py_BEGIN_ALLOW_THREADS + while (scanner.scan()) { + } + Py_END_ALLOW_THREADS + Py_XDECREF(unistr); + + // return PyList_New(0); // uncomment to see timings for scanning + + int size = scanner.tokens.size(); + PyObject *result = PyList_New(size); + if (!result) { + return 0; + } + + for (int i=0; i<size; i++) { + Token t = scanner.tokens[i]; + PyList_SET_ITEM(result, i, Py_BuildValue("iii", t.type, t.start, t.len)); + } + + return result; +} + + + +static PyMethodDef module_functions[] = { + {"scan", (PyCFunction)py_scan, METH_VARARGS, "scan(text)"}, + {0, 0}, +}; + + + +extern "C" { + DL_EXPORT(void) init_mwscan(); +} + +DL_EXPORT(void) init_mwscan() +{ + /*PyObject *m =*/ Py_InitModule("_mwscan", module_functions); +} diff --git a/mwlib/_mwscan.so b/mwlib/_mwscan.so Binary files differnew file mode 100755 index 0000000..50feae0 --- /dev/null +++ b/mwlib/_mwscan.so diff --git a/mwlib/_version.py b/mwlib/_version.py new file mode 100644 index 
0000000..9e73d4f --- /dev/null +++ b/mwlib/_version.py @@ -0,0 +1,9 @@ +class _Version(tuple): + """internal version object, subclass of C{tuple}, + but implements a fancier __str__ representation + """ + def __str__(self): + return '.'.join([str(x) for x in self]) + +version = _Version((0,6,1)) +del _Version diff --git a/mwlib/advtree.py b/mwlib/advtree.py new file mode 100644 index 0000000..52d57e5 --- /dev/null +++ b/mwlib/advtree.py @@ -0,0 +1,545 @@ +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +""" +The parse tree generated by the parser is a 1:1 representation of the mw-markup. +Unfortunally these trees have some flaws if used to geenerate derived documents. + +This module seeks to rebuild the parstree +to be: + * more logical markup + * clean up the parse tree + * make it more accessible + * allow for validity checks + * implement rebuilding strategies + +Usefull Documentation: +http://en.wikipedia.org/wiki/Wikipedia:Don%27t_use_line_breaks +http://meta.wikimedia.org/wiki/Help:Advanced_editing +""" + +import weakref +from mwlib.parser import Magic, Math, _VListNode, Ref, Link, URL, NamedURL # not used but imported +from mwlib.parser import CategoryLink, SpecialLink, Caption, LangLink # not used but imported +from mwlib.parser import Item, ItemList, Node, Table, Row, Cell, Paragraph, PreFormatted +from mwlib.parser import Section, Style, TagNode, Text, Timeline +from mwlib.parser import ImageLink, Article, Book, Chapter +import copy +from mwlib.log import Log + +log = Log("advtree") + + +def _idIndex(lst, el): + # return first appeareance of element in list + for i, e in enumerate(lst): + if e is el: + return i + return -1 + +class AdvancedNode: + """ + MixIn Class that extends Nodes so they become easier accessible + + allows to traverse the tree in any direction and + build derived convinience functions + """ + _parentref = None # weak referece to parent element + isblocknode = False + + def 
copy(self): + "return a copy of this node and all its children" + n = copy.copy(self) + n.children = [] + n._parentref = None + for c in self: + n.appendChild(c.copy()) + return n + + + def moveto(self, targetnode, prefix=False): + """ + moves this node after target node + if prefix is true, move in front of target node + """ + if self.parent: + self.parent.removeChild(self) + tp = targetnode.parent + idx = _idIndex(tp.children, targetnode) + if not prefix: + idx+=1 + tp.children = tp.children[:idx] + [self] + tp.children[idx:] + self._parentref = weakref.ref(tp) + + def appendChild(self, c): + self.children.append(c) + c._parentref = weakref.ref(self) + + def remove(self): + if self.parent: + for (idx, n) in enumerate(self.parent.children): + if n is self: + self.parent.children = self.parent.children[:idx] + self.parent.children[idx+1:] + return 0 + else: + return 1 + + def removeChild(self, c): + self.replaceChild(c, []) + + def replaceChild(self, c, newchildren = []): + idx = _idIndex(self.children, c) + self.children.remove(c) + c._parentref = None + if newchildren: + self.children = self.children[:idx] + newchildren + self.children[idx:] + for nc in newchildren: + nc._parentref = weakref.ref(self) + + def getParents(self): + if self.parent: + return self.parent.getParents() + [self.parent] + else: + return [] + + def getParent(self): + if not self._parentref: + return None + x = self._parentref() + if not x: + raise weakref.ReferenceError + return x + + def getLevel(self): + "returns the number of nodes of same class in parents" + return [p.__class__ for p in self.getParents()].count(self.__class__) + + + def getParentNodesByClass(self, klass): + "returns parents w/ klass" + return [p for p in self.parents if p.__class__ == klass] + + def getChildNodesByClass(self, klass): + "returns all children w/ klass" + return [p for p in self.getAllChildren() if p.__class__ == klass] + + def getAllChildren(self): + "don't confuse w/ Node.allchildren() which returns 
allchildren + self" + for c in self.children: + yield c + for x in c.getAllChildren(): + yield x + + def getSiblings(self): + return [c for c in self.getAllSiblings() if c is not self] + + def getAllSiblings(self): + "all siblings plus me my self and i" + if self.parent: + return self.parent.children + return [] + + def getPrevious(self): + "return previous sibling" + s = self.getAllSiblings() + try: + idx = _idIndex(s,self) + except ValueError: + return None + if idx -1 <0: + return None + else: + return s[idx-1] + + def getNext(self): + "return next sibling" + s = self.getAllSiblings() + try: + idx = _idIndex(s,self) + except ValueError: + return None + if idx+1 >= len(s): + return None + else: + return s[idx+1] + + def getLast(self): + "return last sibling" + s = self.getAllSiblings() + if s: + return s[-1] + + def getFirst(self): + "return first sibling" + s = self.getAllSiblings() + if s: + return s[0] + + def getLastChild(self): + "return last child of this node" + if self.children: + return self.children[-1] + + def getFirstChild(self): + "return first child of this node" + if self.children: + return self.children[0] + + def getAllDisplayText(self, amap = None): + "return all text that is intended for display" + text = [] + if not amap: + amap = {Text:"caption", Link:"target", URL:"caption", Math:"caption", ImageLink:"caption" } + for n in self.allchildren(): + access = amap.get(n.__class__, "") + if access: + text.append( getattr(n, access) ) + alltext = [t for t in text if t] + if alltext: + return u''.join(alltext) + else: + return '' + + parent = property(getParent) + parents = property(getParents) + next = property(getNext) + previous = property(getPrevious) + siblings = property(getSiblings) + last = property(getLast) + first = property(getFirst) + lastchild = property(getLastChild) + firstchild = property(getFirstChild) + + + +# -------------------------------------------------------------------------- +# MixinClasses w/ special behaviour +# 
------------------------------------------------------------------------- + +class AdvancedTable(AdvancedNode): + @property + def rows(self): + return [r for r in self if r.__class__ == Row] + + @property + def numcols(self): + cols = [[n.__class__ for n in row].count(Cell) for row in self.rows] + if cols: + return max(cols) + else: + return 0 + +class AdvancedRow(AdvancedNode): + @property + def cells(self): + return [c for c in self if c.__class__ == Cell] + + +class AdvancedSection(AdvancedNode): + h_level = 0 # this is set if it originates from an H1, H2, ... TagNode + def getSectionLevel(self): + return 1 + self.getLevel() + +class AdvancedImageLink(AdvancedNode): + isblocknode = property ( lambda s: not s.isInline() ) + +class AdvancedMath(AdvancedNode): + @property + def isblocknode(self): + if self.caption.strip().startswith("\\begin{align}") or \ + self.caption.strip().startswith("\\begin{alignat}"): + return True + return False + + + +# -------------------------------------------------------------------------- +# Missing as Classes derived from parser.Style +# ------------------------------------------------------------------------- + + +class Emphasized(Style, AdvancedNode): + "EM" + pass + +class Strong(Style, AdvancedNode): + pass + +class DefinitionList(Style, AdvancedNode): + "DL" + pass + +class DefinitionTerm(Style, AdvancedNode): + "DT" + pass + +class DefinitionDescription(Style, AdvancedNode): + "DD" + pass + +class Blockquote(Style, AdvancedNode): + "margins to left & right" + pass + +class Indented(Style, AdvancedNode): + "margin to the left" + +class Overline(Style, AdvancedNode): + _style = "overline" + +class Underline(Style, AdvancedNode): + _style = "u" + +class Sub(Style, AdvancedNode): + _style = "sub" + +class Sup(Style, AdvancedNode): + _style = "sup" + +class Small(Style, AdvancedNode): + _style = "small" + +class Big(Style, AdvancedNode): + _style = "big" + +class Cite(Style, AdvancedNode): + _style = "cite" + + +_styleNodeMap = 
dict( (k._style,k) for k in [Overline, Underline, Sub, Sup, Small, Big, Cite] ) + +# -------------------------------------------------------------------------- +# Missing as Classes derived from parser.TagNode +# ------------------------------------------------------------------------- + +class Source(TagNode, AdvancedNode): + _tag = "source" + +class Code(TagNode, AdvancedNode): + _tag = "code" + +class BreakingReturn(TagNode, AdvancedNode): + _tag = "br" + +class HorizontalRule(TagNode, AdvancedNode): + _tag = "hr" + +class Index(TagNode, AdvancedNode): + _tag = "index" + +class Teletyped(TagNode, AdvancedNode): + _tag = "tt" + +class Reference(TagNode, AdvancedNode): + _tag = "ref" + +class ReferenceList(TagNode, AdvancedNode): + _tag = "references" + +class Gallery(TagNode, AdvancedNode): + _tag = "gallery" + +class Center(TagNode, AdvancedNode): + _tag = "center" + +class Div(TagNode, AdvancedNode): + _tag = "div" + +class Span(TagNode, AdvancedNode): # span is defined as inline node which is in theory correct. + _tag = "span" + +class Strike(TagNode,AdvancedNode): + _tag = "strike" + +class ImageMap(TagNode, AdvancedNode): # defined as block node, maybe incorrect + _tag = "imagemap" + +_tagNodeMap = dict( (k._tag,k) for k in [Source, Code, BreakingReturn, HorizontalRule, Index, Teletyped, Reference, ReferenceList, Gallery, Center, Div, Span, Strike, ImageMap] ) +_styleNodeMap["s"] = Strike # Special Handling for deprecated s style + + +# -------------------------------------------------------------------------- +# BlockNode separation for AdvancedNode.isblocknode +# ------------------------------------------------------------------------- + +""" +For writers it is usefull to know whether elements are inline (within a paragraph) or not. 
+We define list for blocknodes, which are used in AdvancedNode as: + +AdvancedNode.isblocknode + +Image depends on result of Image.isInline() see above + +Open Issues: Math, Magic, (unknown) TagNode + +""" +_blockNodesMap = (Book, Chapter, Article, Section, Paragraph, Div, + PreFormatted, Cell, Row, Table, Item, BreakingReturn, + ItemList, Timeline, Cite, HorizontalRule, Gallery, Indented, + DefinitionList, DefinitionTerm, DefinitionDescription, ReferenceList, Source) + +for k in _blockNodesMap: + k.isblocknode = True + + + +# -------------------------------------------------------------------------- +# funcs for extending the nodes +# ------------------------------------------------------------------------- + +def MixIn(pyClass, mixInClass, makeFirst=False): + if mixInClass not in pyClass.__bases__: + if makeFirst: + pyClass.__bases__ = (mixInClass,) + pyClass.__bases__ + else: + pyClass.__bases__ += (mixInClass,) + +def extendClasses(node): + for c in node.children[:]: + extendClasses(c) + c._parentref = weakref.ref(node) + +# Nodes we defined above and that are separetly handled in extendClasses +_advancedNodesMap = {Section: AdvancedSection, ImageLink:AdvancedImageLink, + Math:AdvancedMath, Row:AdvancedRow, Table:AdvancedTable} +MixIn(Node, AdvancedNode) +for k, v in _advancedNodesMap.items(): + MixIn(k,v) + +# -------------------------------------------------------------------------- +# funcs for repairing the tree +# ------------------------------------------------------------------------- + + +def fixTagNodes(node): + """ + detect known TagNode(s) and associate appropriate Nodes + """ + for c in node.children: + if c.__class__ == TagNode: + if c.caption in _tagNodeMap: + c.__class__ = _tagNodeMap[c.caption] + elif c.caption in ("h1", "h2", "h3", "h4", "h5", "h6"): # FIXME + # NEED TO MOVE NODE IF IT REALLY STARTS A SECTION + c.__class__ = Section + MixIn(c.__class__, AdvancedSection) + c.level = int(c.caption[1]) + c.caption = "" + else: + 
log.warn("fixTagNodes, unknowntagnode %r" % c) + #raise Exception, "unknown tag %s" % c.caption # FIXME + fixTagNodes(c) + + +def fixStyle(node): + """ + parser.Style Nodes are mapped to logical markup + detection of DefinitionList depends on removeNodes + and removeNewlines + """ + if not node.__class__ == Style: + return + # replace this node by a more apporiate + if node.caption == "''": + node.__class__ = Emphasized + node.caption = "" + elif node.caption=="'''''": + node.__class__ = Strong + node.caption = "" + em = Emphasized("''") + for c in node.children: + em.appendChild(c) + node.children = [] + node.appendChild(em) + elif node.caption == "'''": + node.__class__ = Strong + node.caption = "" + elif node.caption == ";": + # this starts a definition list ? DL [DT->DD, ...] + # check if previous node is DefinitionList, if not create one + if node.previous.__class__ == DefinitionList: + node.__class__ = DefinitionTerm + node.moveto(node.previous.lastchild) + else: + node.__class__ = DefinitionList + dt = DefinitionTerm() + for c in node.children: + dt.appendChild(c) + node.children = [] + node.appendChild(dt) + elif node.caption.startswith(":"): + if node.previous.__class__ == DefinitionList: + node.__class__ = DefinitionDescription + node.moveto(node.previous.lastchild) + node.caption = "" + else: + node.__class__ = Indented + elif node.caption in _styleNodeMap: + node.__class__ = _styleNodeMap[node.caption] + node.caption = "" + else: + log.warn("fixStyle, unknownstyle %r" % node) + #raise Exception, "unknown style %s" % node.caption # FIXME + pass + return node + +def fixStyles(node): + if node.__class__ == Style: + fixStyle(node) + for c in node.children[:]: + fixStyles(c) + + +def removeNodes(node): + """ + the parser generates empty Node elements that do + nothing but group other nodes. 
we remove them here + """ + if node.__class__ == Node: + # first child of section groups heading text - grouping Node must not be removed + if not (node.previous == None and node.parent.__class__ == Section): + node.parent.replaceChild(node, node.children) + for c in node.children[:]: + removeNodes(c) + +def removeNewlines(node): + """ + remove newlines, tabs, spaces if we are next to a blockNode + """ + if node.__class__ == Text and not node.getParentNodesByClass(PreFormatted) and not node.getParentNodesByClass(Source): + if node.caption.strip() == u"": + prev = node.previous or node.parent # previous sibling node or parentnode + next = node.next or node.parent.next + if not next or next.isblocknode or not prev or prev.isblocknode: + node.parent.removeChild(node) + node.caption = node.caption.replace("\n", " ") + + for c in node.children[:]: + removeNewlines(c) + + + + +def buildAdvancedTree(root): # USE WITH CARE + """ + extends and cleans parse trees + do not use this funcs without knowing whether these + Node modifications fit your problem + """ + extendClasses(root) + fixTagNodes(root) + removeNodes(root) + removeNewlines(root) + fixStyles(root) + +def getAdvTree(fn): + from mwlib.dummydb import DummyDB + from mwlib.uparser import parseString + db = DummyDB() + input = unicode(open(fn).read(), 'utf8') + r = parseString(title=fn, raw=input, wikidb=db) + buildAdvancedTree(r) + return r + + + diff --git a/mwlib/allnodes.py b/mwlib/allnodes.py new file mode 100644 index 0000000..ebd6d3b --- /dev/null +++ b/mwlib/allnodes.py @@ -0,0 +1,27 @@ +import mwlib.parser +import mwlib.advtree + +import types + +def allnodes(): + all = set() + names = set() + for m in (mwlib.parser, mwlib.advtree): + for x in dir(m): + if x in names: + continue + k = getattr(m, x) + if type(k) == types.TypeType: + if issubclass(k, mwlib.parser.Node): + all.add(k) + names.add(x) + return all + + +if __name__ == '__main__': + # EXAMPLE THAT SHOWS HOW TO IDENTIFY MISSING NODES + from 
mwlib.parser import Control, Chapter + my = set((Control, Chapter)) + missing = allnodes() - my + assert len(missing) == len(allnodes()) -2 + #print missing diff --git a/mwlib/apps.py b/mwlib/apps.py new file mode 100644 index 0000000..55a427e --- /dev/null +++ b/mwlib/apps.py @@ -0,0 +1,378 @@ + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +"""main programs - installed via setuptools' entry_points""" + +import optparse + +def buildcdb(): + parser = optparse.OptionParser(usage="%prog --input XMLDUMP --output OUTPUT") + parser.add_option("-i", "--input", help="input file") + parser.add_option("-o", "--output", help="write output to OUTPUT") + options, args = parser.parse_args() + + if args: + parser.error("too many arguments.") + + + input = options.input + output = options.output + + if not (input and output): + parser.error("missing argument.") + + import os + from mwlib import cdbwiki + + cdbwiki.BuildWiki(input, output)() + open(os.path.join(output, "wikiconf.txt"), "w").write(""" +[wiki] +type = cdb +path = %s + +[images] +type = download +url = http://upload.wikimedia.org/wikipedia/commons/ +localpath = ~/images +""" % (os.path.abspath(output),)) + +def show(): + parser = optparse.OptionParser(usage="%prog [-e|--expand] --conf CONF ARTICLE [...]") + parser.add_option("-c", "--conf", help="config file") + parser.add_option("-e", "--expand", action="store_true", help="expand templates") + parser.add_option("-t", "--template", action="store_true", help="show template") + + options, args = parser.parse_args() + + if not args: + parser.error("missing ARTICLE argument") + + articles = [unicode(x, 'utf-8') for x in args] + + conf = options.conf + if not conf: + parser.error("missing --conf argument") + + from mwlib import wiki, expander + + db = wiki.makewiki(conf)['wiki'] + + for a in articles: + if options.template: + raw=db.getTemplate(a) + else: + raw=db.getRawArticle(a) + + if raw: + if options.expand: + te 
= expander.Expander(raw, pagename=a, wikidb=db) + raw = te.expandTemplates() + + print raw.encode("utf-8") + + +def buildzip(): + parser = optparse.OptionParser(usage="%prog [OPTIONS] [ARTICLE ...]") + parser.add_option("-c", "--conf", help="config file (required unless --baseurl is given)") + parser.add_option("-b", "--baseurl", help="base URL for mwapidb backend") + parser.add_option("-s", "--shared-baseurl", help="base URL for shared images for mwapidb backend") + parser.add_option("-m", "--metabook", help="JSON encoded text file with book structure") + parser.add_option('--collectionpage', help='Title of a collection page') + parser.add_option("-x", "--noimages", action="store_true", help="exclude images") + parser.add_option("-o", "--output", help="write output to OUTPUT") + parser.add_option("-p", "--posturl", help="http post to POSTURL") + parser.add_option("-i", "--imagesize", + help="max. pixel size (width or height) for images (default: 800)") + parser.add_option("-d", "--daemonize", action="store_true", + help='become daemon after collection articles (before POST request)') + parser.add_option("-l", "--logfile", help="log to logfile") + parser.add_option("--license", help="Title of article containing full license text") + parser.add_option("--template-blacklist", help="Title of article containing blacklisted templates") + options, args = parser.parse_args() + + import tempfile + import os + import zipfile + + from mwlib import utils + from mwlib.utils import daemonize + + articles = [unicode(x, 'utf-8') for x in args] + + baseurl = options.baseurl + conf = options.conf + if not baseurl and not options.conf: + parser.error("neither --conf nor --baseurl specified\nuse --help for all options") + + posturl = None + def post_status(status): + print 'status:', status + if not posturl: + return + try: + return urllib2.urlopen(posturl, urllib.urlencode({'status': status})).read() + except Exception, e: + print 'ERROR posting status %r to %r' % (status, posturl) 
+ + def post_progress(progress): + print 'progress', progress + if not posturl: + return + try: + return urllib2.urlopen(posturl, urllib.urlencode({'progress': int(progress)})).read() + except Exception, e: + print 'ERROR posting progress %r to %r' % (progress, posturl) + + try: + if options.logfile: + utils.start_logging(options.logfile) + + output = options.output + + from mwlib import wiki, recorddb, metabook + + mb = metabook.MetaBook() + if conf: + from ConfigParser import ConfigParser + + w = wiki.makewiki(conf) + cp = ConfigParser() + cp.read(conf) + license = { + 'name': cp.get('wiki', 'defaultarticlelicense') + } + if license['name'] is not None: + license['wikitext'] = w['wiki'].getRawArticle(license['name']) + mb.source = { + 'name': cp.get('wiki', 'name'), + 'url': cp.get('wiki', 'url'), + 'defaultarticlelicense': license, + } + else: + w = { + 'wiki': wiki.wiki_mwapi(baseurl, options.license, options.template_blacklist), + 'images': wiki.image_mwapi(baseurl, shared_base_url=options.shared_baseurl) + } + metadata = w['wiki'].getMetaData() + mb.source = { + 'name': metadata['name'], + 'url': metadata['url'], + 'defaultarticlelicense': metadata['license'], + } + + if options.noimages: + w['images'] = None + else: + if options.imagesize: + imagesize = int(options.imagesize) + else: + imagesize = 800 + + if output: + zipfilename = output + else: + fd, zipfilename = tempfile.mkstemp() + os.close(fd) + + if options.collectionpage: + mwcollection = w['wiki'].getRawArticle(options.collectionpage) + mb.loadCollectionPage(mwcollection) + elif options.metabook: + mb.readJsonFile(options.metabook) + + # do not daemonize earlier: Collection extension deletes input metabook file! 
+ if options.daemonize: + daemonize() + + posturl = options.posturl + if posturl: + posturl = posturl.encode('utf-8') + + from mwlib.utils import get_multipart + import urllib + import urllib2 + + zf = zipfile.ZipFile(zipfilename, 'w') + z = recorddb.ZipfileCreator(zf, w['wiki'], w['images']) + + post_status('parsing') + + for x in articles: + z.addArticle(x) + mb.addArticles(articles) + + z.addObject('metabook.json', mb.dumpJson()) + articles = list(mb.getArticles()) + if articles: + inc = 70/len(articles) + else: + inc = 0 + p = 0 + for title, revision in articles: + post_progress(p) + z.addArticle(title, revision=revision) + p += inc + + post_status('packaging') + + if not options.noimages: + z.writeImages(size=imagesize) + + post_progress(80) + + z.writeContent() + zf.close() + + post_progress(90) + + if posturl: + post_status('uploading') + zf = open(zipfilename, "rb") + ct, data = get_multipart('collection.zip', zf.read(), 'collection') + zf.close() + req = urllib2.Request(posturl, data=data, headers={"Content-Type": ct}) + result = urllib2.urlopen(req).read() + + if w['images']: + w['images'].clear() + + if not output: + os.unlink(zipfilename) + + post_status('finished') + post_progress(100) + except Exception, e: + post_status('error') + raise + + +def parse(): + parser = optparse.OptionParser(usage="%prog [-a|--all] --conf CONF [ARTICLE1 ...]") + parser.add_option("-a", "--all", action="store_true", help="parse all articles") + parser.add_option("--tb", action="store_true", help="show traceback on error") + + parser.add_option("-c", "--conf", help="config file") + + options, args = parser.parse_args() + + if not args and not options.all: + parser.error("missing option.") + + if not options.conf: + parser.error("missing --conf argument") + + articles = [unicode(x, 'utf-8') for x in args] + + conf = options.conf + + import traceback + from mwlib import wiki, uparser + + w = wiki.makewiki(conf) + + db = w['wiki'] + + if options.all: + if not hasattr(db, 
"articles"): + raise RuntimeError("%s does not support iterating over all articles" % (db, )) + articles = db.articles() + + + import time + for x in articles: + try: + raw = db.getRawArticle(x) + # yes, raw can be None, when we have a redirect to a non-existing article. + if raw is None: + continue + stime=time.time() + a=uparser.parseString(x, raw=raw, wikidb=db) + except Exception, err: + print "F", repr(x), err + if options.tb: + traceback.print_exc() + else: + print "G", time.time()-stime, repr(x) + +def serve(): + parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]") + parser.add_option("-c", "--conf", help="config file") + + options, args = parser.parse_args() + + + conf = options.conf + if not options.conf: + parser.error("missing --conf argument") + + from mwlib import wiki, web + + res = wiki.makewiki(conf) + db = res['wiki'] + images = res['images'] + from wsgiref.simple_server import make_server, WSGIServer + + from SocketServer import ForkingMixIn + class MyServer(ForkingMixIn, WSGIServer): + pass + + iface, port = '0.0.0.0', 8080 + print "serving on %s:%s" % (iface, port) + http = make_server(iface, port, web.Serve(db, res['images']), server_class=MyServer) + http.serve_forever() + + + +def html(): + parser = optparse.OptionParser(usage="%prog --conf CONF ARTICLE [...]") + parser.add_option("-c", "--conf", help="config file") + + options, args = parser.parse_args() + + if not args: + parser.error("missing ARTICLE argument") + + articles = [unicode(x, 'utf-8') for x in args] + + conf = options.conf + if not options.conf: + parser.error("missing --conf argument") + + import StringIO + import tempfile + import os + import webbrowser + from mwlib import wiki, uparser, htmlwriter + + res = wiki.makewiki(conf) + db = res['wiki'] + images = res['images'] + + for a in articles: + raw=db.getRawArticle(a) + if not raw: + continue + + out=StringIO.StringIO() + out.write("""<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC 
"-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +<head> +<meta http-equiv="content-type" content="text/html; charset="utf-8"></meta> +<link rel="stylesheet" href="pedia.css" /> +</head> +<body> + +""") + + a=uparser.parseString(x, raw=raw, wikidb=db) + w=htmlwriter.HTMLWriter(out, images) + w.write(a) + + fd, htmlfile = tempfile.mkstemp(".html") + os.close(fd) + open(htmlfile, "wb").write(out.getvalue().encode('utf-8')) + webbrowser.open("file://"+htmlfile) + + diff --git a/mwlib/caller.py b/mwlib/caller.py new file mode 100755 index 0000000..583a123 --- /dev/null +++ b/mwlib/caller.py @@ -0,0 +1,20 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import sys +import os + +def caller(n=2): + """return caller as string""" + f = sys._getframe(n) + return "%s:%s" % (f.f_code.co_filename, f.f_lineno) + +def short(n=2): + """return caller as string""" + f = sys._getframe(n) + return "%s:%s" % (os.path.basename(f.f_code.co_filename), f.f_lineno) + +def callerframe(n=2): + return sys._getframe(n) diff --git a/mwlib/cdb.py b/mwlib/cdb.py new file mode 100755 index 0000000..9aa2a65 --- /dev/null +++ b/mwlib/cdb.py @@ -0,0 +1,262 @@ +#! 
/usr/bin/env python +""" +Dan Bernstein's CDB implemented in Python + +see http://cr.yp.to/cdb.html + +""" + +from __future__ import generators + +import os +import struct +import mmap + +def uint32_unpack(buf): + return struct.unpack('<L', buf)[0] + +def uint32_pack(n): + return struct.pack('<L', n) + +CDB_HASHSTART = 5381 + +def cdb_hash(buf): + h = CDB_HASHSTART + for c in buf: + h = (h + (h << 5)) & 0xffffffffL + h ^= ord(c) + return h + +class Cdb(object): + + def __init__(self, fp): + self.fp = fp + fd = fp.fileno() + self.size = os.fstat(fd).st_size + self.map = mmap.mmap(fd, self.size, access=mmap.ACCESS_READ) + self.eod = uint32_unpack(self.map[:4]) + self.findstart() + self.loop = 0 # number of hash slots searched under this key + # initialized if loop is nonzero + self.khash = 0 + self.hpos = 0 + self.hslots = 0 + # initialized if findnext() returns 1 + self.dpos = 0 + self.dlen = 0 + + def close(self): + self.map.close() + + def __iter__(self, fn=None): + len = 2048 + while len < self.eod: + klen, vlen = struct.unpack("<LL", self.map[len:len+8]) + len += 8 + key = self.map[len:len+klen] + len += klen + val = self.map[len:len+vlen] + len += vlen + if fn: + yield fn(key, val) + else: + yield (key, val) + + def iteritems(self): + return self.__iter__() + + def iterkeys(self): + return self.__iter__(lambda k,v: k) + + def itervalues(self): + return self.__iter__(lambda k,v: v) + + def items(self): + ret = [] + for i in self.iteritems(): + ret.append(i) + return ret + + def keys(self): + ret = [] + for i in self.iterkeys(): + ret.append(i) + return ret + + def values(self): + ret = [] + for i in self.itervalues(): + ret.append(i) + return ret + + def findstart(self): + self.loop = 0 + + def read(self, n, pos): + # XXX add code for platforms without mmap + return self.map[pos:pos+n] + + def match(self, key, pos): + if key == self.read(len(key), pos): + return 1 + else: + return 0 + + def findnext(self, key): + if not self.loop: + u = cdb_hash(key) + buf = 
self.read(8, u << 3 & 2047) + self.hslots = uint32_unpack(buf[4:]) + if not self.hslots: + raise KeyError + self.hpos = uint32_unpack(buf[:4]) + self.khash = u + u >>= 8 + u %= self.hslots + u <<= 3 + self.kpos = self.hpos + u + + while self.loop < self.hslots: + buf = self.read(8, self.kpos) + pos = uint32_unpack(buf[4:]) + if not pos: + raise KeyError + self.loop += 1 + self.kpos += 8 + if self.kpos == self.hpos + (self.hslots << 3): + self.kpos = self.hpos + u = uint32_unpack(buf[:4]) + if u == self.khash: + buf = self.read(8, pos) + u = uint32_unpack(buf[:4]) + if u == len(key): + if self.match(key, pos + 8): + dlen = uint32_unpack(buf[4:]) + dpos = pos + 8 + len(key) + return self.read(dlen, dpos) + raise KeyError + + def __getitem__(self, key): + self.findstart() + return self.findnext(key) + + def get(self, key, default=None): + self.findstart() + try: + return self.findnext(key) + except KeyError: + return default + +def cdb_dump(infile): + """dump a database in djb's cdbdump format""" + db = Cdb(infile) + for key,value in db.iteritems(): + print "+%d,%d:%s->%s" % (len(key), len(value), key, value) + print + +def cdb_make(outfile, items): + pos = 2048 + tables = {} # { h & 255 : [(h, p)] } + + # write keys and data + outfile.seek(pos) + for key, value in items: + outfile.write(uint32_pack(len(key)) + uint32_pack(len(value))) + h = cdb_hash(key) + outfile.write(key) + outfile.write(value) + tables.setdefault(h & 255, []).append((h, pos)) + pos += 8 + len(key) + len(value) + + final = '' + # write hash tables + for i in range(256): + entries = tables.get(i, []) + nslots = 2*len(entries) + final += uint32_pack(pos) + uint32_pack(nslots) + null = (0, 0) + table = [null] * nslots + for h, p in entries: + n = (h >> 8) % nslots + while table[n] is not null: + n = (n + 1) % nslots + table[n] = (h, p) + for h, p in table: + outfile.write(uint32_pack(h) + uint32_pack(p)) + pos += 8 + + # write header (pointers to tables and their lengths) + outfile.flush() + 
outfile.seek(0) + outfile.write(final) + +class CdbMake(object): + def __init__(self, outfile): + self.pos = 2048 + self.outfile = outfile + self.outfile.seek(self.pos) + self.tables = {} + + def add(self, key, value): + outfile = self.outfile + outfile.write(uint32_pack(len(key)) + uint32_pack(len(value))) + h = cdb_hash(key) + outfile.write(key) + outfile.write(value) + self.tables.setdefault(h & 255, []).append((h, self.pos)) + self.pos += 8 + len(key) + len(value) + + def finish(self): + final = '' + tables = self.tables + pos = self.pos + outfile = self.outfile + + # write hash tables + for i in range(256): + entries = tables.get(i, []) + nslots = 2*len(entries) + final += uint32_pack(pos) + uint32_pack(nslots) + null = (0, 0) + table = [null] * nslots + for h, p in entries: + n = (h >> 8) % nslots + while table[n] is not null: + n = (n + 1) % nslots + table[n] = (h, p) + for h, p in table: + outfile.write(uint32_pack(h) + uint32_pack(p)) + pos += 8 + + # write header (pointers to tables and their lengths) + outfile.flush() + outfile.seek(0) + outfile.write(final) + + +def test(): + #db = Cdb(open("t")) + #print db['one'] + #print db['two'] + #print db['foo'] + #print db['us'] + #print db.get('ec') + #print db.get('notthere') + db = open('test.cdb', 'wb') + cdb_make(db, + [('one', 'Hello'), + ('two', 'Goodbye'), + ('foo', 'Bar'), + ('us', 'United States'), + ]) + db.close() + db = Cdb(open("test.cdb", 'rb')) + print db['one'] + print db['two'] + print db['foo'] + print db['us'] + print db.get('ec') + print db.get('notthere') + +if __name__ == '__main__': + test() diff --git a/mwlib/cdbwiki.py b/mwlib/cdbwiki.py new file mode 100755 index 0000000..98bb6a7 --- /dev/null +++ b/mwlib/cdbwiki.py @@ -0,0 +1,243 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +import sys +import os +import zlib +import re + +from mwlib import cdb + +try: + from xml.etree import cElementTree +except ImportError: + import cElementTree + +ns = '{http://www.mediawiki.org/xml/export-0.3/}' + +wikiindex = "wikiidx" +wikidata = "wikidata.bin" + + + +def normname(name): + name = name.strip().replace("_", " ") + name = name[:1].upper()+name[1:] + return name + +class Tags: + page = ns + 'page' + + # <title> inside <page> + title = ns + 'title' + + # <revision> inside <page> + revision = ns + 'revision' + + # <id> inside <revision> + revid = ns + 'id' + + # <contributor><username> inside <revision> + username = ns + 'contributor/' + ns + 'username' + + # <text> inside <revision> + text = ns + 'text' + + # <timestamp> inside <revision> + timestamp = ns + 'timestamp' + + # <revision><text> inside <page> + revision_text = ns + 'revision/' + ns + 'text' + + siteinfo = ns + "siteinfo" + +class DumpParser(object): + category_ns = set(['category', 'kategorie']) + image_ns = set(['image', 'bild']) + template_ns = set(['template', 'vorlage']) + wikipedia_ns = set(['wikipedia']) + + tags = Tags() + + + def __init__(self, xmlfilename): + self.xmlfilename = xmlfilename + + def _write(self, msg): + sys.stdout.write(msg) + sys.stdout.flush() + + def openInputStream(self): + if self.xmlfilename.lower().endswith(".bz2"): + f = os.popen("bunzip2 -c %s" % self.xmlfilename, "r") + elif self.xmlfilename.lower().endswith(".7z"): + f = os.popen("7z -so x %s" % self.xmlfilename, "r") + else: + f = open(self.xmlfilename, "r") + + return f + + def __call__(self): + f = self.openInputStream() + + count = 0 + for event, elem in cElementTree.iterparse(f): + if elem.tag != self.tags.page: + continue + self.handlePageElement(elem) + elem.clear() + count += 1 + + if count % 5000 == 0: + self._write(" %s\n" % count) + elif count % 100 == 0: + self._write(".") + + + def handlePageElement(self, page): + title = page.find(self.tags.title).text + revisions = 
page.findall(self.tags.revision) + if not revisions: + return + revision = revisions[-1] + + texttag = revision.find(self.tags.text) + timestamptag = revision.find(self.tags.timestamp) + revision.clear() + + if texttag is not None: + text = texttag.text + texttag.clear() + else: + text = None + + if timestamptag is not None: + timestamp = timestamptag.text + timestamptag.clear() + else: + timestamp = None + + if not text: + return + + if isinstance(title, str): + title = unicode(title) + if isinstance(text, str): + text = unicode(text) + + + if ':' in title: + ns, rest = title.split(':', 1) + ns = ns.lower() + if ns not in self.template_ns: + return + self.handleTemplate(rest, text, timestamp) + else: + self.handleArticle(title, text, timestamp) + + def handleArticle(self, title, text, timestamp): + print "ART:", repr(title), len(text), timestamp + + def handleTemplate(self, title, text, timestamp): + print "TEMPL:", repr(title), len(text), timestamp + +class BuildWiki(DumpParser): + def __init__(self, xmlfilename, outputdir): + DumpParser.__init__(self, xmlfilename) + self.outputdir = outputdir + + def __call__(self): + if not os.path.exists(self.outputdir): + os.makedirs(self.outputdir) + + n = os.path.join(self.outputdir, wikiindex) + out = open(os.path.join(self.outputdir, wikidata), "wb") + self.out = out + f = open(n+'.cdb', 'wb') + c = cdb.CdbMake(f) + self.cdb = c + + DumpParser.__call__(self) + c.finish() + f.close() + + + def _writeobj(self, key, val): + key = key.encode("utf-8") + val = zlib.compress(val) + pos = self.out.tell() + self.out.write(val) + self.cdb.add(key, "%s %s" % (pos, len(val))) + + def handleArticle(self, title, text, timestamp): + self._writeobj(u":"+title, text.encode("utf-8")) + + def handleTemplate(self, title, text, timestamp): + self._writeobj(u"T:"+title, text.encode("utf-8")) + + + +class WikiDB(object): + redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE) + + def __init__(self, dir): + 
self.dir = dir + self.obj2pos_path = os.path.join(self.dir, wikidata) + self.cdb = cdb.Cdb(open(os.path.join(self.dir, wikiindex+'.cdb'), 'rb')) + + def _readobj(self, key): + key = key.encode("utf-8") + + try: + data = self.cdb[key] + except KeyError: + return None + + pos, len = map(int, data.split()) + + f=open(self.obj2pos_path, "rb") + f.seek(pos) + d=f.read(len) + f.close() + return zlib.decompress(d) + + def getRawArticle(self, title, raw=None, revision=None): + title = normname(title) + res = self._readobj(":"+title) + if res is None: + return None + + res = unicode(res, 'utf-8') + mo = self.redirect_rex.search(res) + if mo: + redirect = mo.group('redirect') + redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0]) + + return self.getRawArticle(redirect) + + return res + + def getTemplate(self, title, followRedirects=False): + if ":" in title: + title = title.split(':', 1)[1] + + title = normname(title) + res = unicode(self._readobj(u"T:"+title) or "", 'utf-8') + if not res: + return res + + mo = self.redirect_rex.search(res) + if mo: + redirect = mo.group('redirect') + redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0]) + return self.getTemplate(redirect) + return res + + + def articles(self): + for k, v in self.cdb: + if k[0]==':': + k = unicode(k[1:], "utf-8") + yield k diff --git a/mwlib/dummydb.py b/mwlib/dummydb.py new file mode 100644 index 0000000..e17a90f --- /dev/null +++ b/mwlib/dummydb.py @@ -0,0 +1,10 @@ + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +class DummyDB(object): + def getRawArticle(self, name): + return None + + def getTemplate(self, name, followRedirects=False): + return None diff --git a/mwlib/expander.py b/mwlib/expander.py new file mode 100755 index 0000000..a3a529d --- /dev/null +++ b/mwlib/expander.py @@ -0,0 +1,553 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +from __future__ import with_statement +import sys +import re +import os +from mwlib import magics +import mwlib.log + +DEBUG = "DEBUG_EXPANDER" in os.environ + + +log = mwlib.log.Log("expander") + +splitpattern = """ +({{+) # opening braces +|(}}+) # closing braces +|(\[\[|\]\]) # link +|((?:<noinclude>.*?</noinclude>)|(?:</?includeonly>)) # noinclude, comments: usually ignore +|(?P<text>(?:<nowiki>.*?</nowiki>) # nowiki +|(?:<math>.*?</math>) +|(?:<imagemap[^<>]*>.*?</imagemap>) +|(?:<gallery[^<>]*>.*?</gallery>) +|(?:<source[^<>]*>.*?</source>) +|(?:<pre.*?>.*?</pre>) +|(?:[:\[\]\|{}<]) # all special characters +|(?:[^\[\]\|:{}<]*)) # all others +""" + +splitrx = re.compile(splitpattern, re.VERBOSE | re.DOTALL | re.IGNORECASE) + +onlyincluderx = re.compile("<onlyinclude>(.*?)</onlyinclude>", re.DOTALL | re.IGNORECASE) + +commentrx = re.compile(r"(\n *)?<!--.*?-->( *\n)?", re.DOTALL) + +def remove_comments(txt): + def repl(m): + #print "M:", repr(txt[m.start():m.end()]) + if txt[m.start()]=='\n' and txt[m.end()-1]=='\n': + return '\n' + return (m.group(1) or "")+(m.group(2) or "") + return commentrx.sub(repl, txt) + +def preprocess(txt): + txt=txt.replace("\t", " ") + txt=remove_comments(txt) + return txt + +class symbols: + bra_open = 1 + bra_close = 2 + link = 3 + noi = 4 + txt = 5 + +def old_tokenize(txt): + txt = preprocess(txt) + + if "<onlyinclude>" in txt: + # if onlyinclude tags are used, only use text between those tags. 
template 'legend' is a example + txt = "".join(onlyincluderx.findall(txt)) + + + tokens = [] + for (v1, v2, v3, v4, v5) in splitrx.findall(txt): + if v5: + tokens.append((5, v5)) + elif v4: + tokens.append((4, v4)) + elif v3: + tokens.append((3, v3)) + elif v2: + tokens.append((2, v2)) + elif v1: + tokens.append((1, v1)) + + tokens.append((None, '')) + + return tokens + + +def new_tokenize(txt): + txt = preprocess(txt) + + import _expander + + if "<onlyinclude>" in txt: + # if onlyinclude tags are used, only use text between those tags. template 'legend' is a example + txt = "".join(onlyincluderx.findall(txt)) + + txt=txt+u'\0' + tokens = _expander.scan(txt) + + res = [] + for t in tokens: + type,start,len=t + if type: + res.append((type, txt[start:start+len])) + else: + res.append((None, '')) + + + return res + +tokenize = old_tokenize + + + +class Node(object): + def __init__(self): + self.children = [] + + def __repr__(self): + return "<%s %s children>" % (self.__class__.__name__, len(self.children)) + + def __iter__(self): + for x in self.children: + yield x + + def show(self, out=None): + show(self, out=out) + +class Variable(Node): + pass + +class Template(Node): + pass + +def show(node, indent=0, out=None): + if out is None: + out=sys.stdout + + out.write("%s%r\n" % (" "*indent, node)) + if isinstance(node, basestring): + return + for x in node.children: + show(x, indent+1, out) + +def optimize(node): + if isinstance(node, basestring): + return node + + if type(node) is Node and len(node.children)==1: + return optimize(node.children[0]) + + for i, x in enumerate(node.children): + node.children[i] = optimize(x) + return node + + +class Parser(object): + template_ns = set([ ((5, u'Plantilla'), (5, u':')), + ]) + + + def __init__(self, txt): + self.txt = txt + self.tokens = tokenize(txt) + self.pos = 0 + + def getToken(self): + return self.tokens[self.pos] + + def setToken(self, tok): + self.tokens[self.pos] = tok + + + def variableFromChildren(self, children): 
+ v=Variable() + name = Node() + v.children.append(name) + + try: + idx = children.index(u"|") + except ValueError: + name.children = children + else: + name.children = children[:idx] + v.children.extend(children[idx+1:]) + return v + + def _eatBrace(self, num): + ty, txt = self.getToken() + assert ty == symbols.bra_close + assert len(txt)>= num + newlen = len(txt)-num + if newlen==0: + self.pos+=1 + return + + if newlen==1: + ty = symbols.txt + + txt = txt[:newlen] + self.setToken((ty, txt)) + + + def templateFromChildren(self, children): + t=Template() + # find the name + name = Node() + t.children.append(name) + for idx, c in enumerate(children): + if c==u'|': + break + name.children.append(c) + + + # find the arguments + + + arg = Node() + + linkcount = 0 + for idx, c in enumerate(children[idx+1:]): + if c==u'[[': + linkcount += 1 + elif c==']]': + linkcount -= 1 + elif c==u'|' and linkcount==0: + t.children.append(arg) + arg = Node() + continue + arg.children.append(c) + + + if arg.children: + t.children.append(arg) + + + return t + + def parseOpenBrace(self): + ty, txt = self.getToken() + n = Node() + + numbraces = len(txt) + self.pos += 1 + + while 1: + ty, txt = self.getToken() + if ty==symbols.bra_open: + n.children.append(self.parseOpenBrace()) + elif ty is None: + break + elif ty==symbols.bra_close: + closelen = len(txt) + if closelen==2 or numbraces==2: + t=self.templateFromChildren(n.children) + n=Node() + n.children.append(t) + self._eatBrace(2) + numbraces-=2 + else: + v=self.variableFromChildren(n.children) + n=Node() + n.children.append(v) + self._eatBrace(3) + numbraces -= 3 + + if numbraces==0: + break + elif numbraces==1: + n.children.insert(0, "{") + break + elif ty==symbols.noi: + self.pos += 1 # ignore <noinclude> + else: # link, txt + n.children.append(txt) + self.pos += 1 + + return n + + def parse(self): + n = Node() + while 1: + ty, txt = self.getToken() + if ty==symbols.bra_open: + n.children.append(self.parseOpenBrace()) + elif ty is 
None: + break + elif ty==symbols.noi: + self.pos += 1 # ignore <noinclude> + else: # bra_close, link, txt + n.children.append(txt) + self.pos += 1 + return n + +def parse(txt): + return optimize(Parser(txt).parse()) + +class MemoryLimitError(Exception): + pass + +class LazyArgument(object): + def __init__(self, node, expander, variables): + self.node = node + self.expander = expander + self._flatten = None + self.variables = variables + + def flatten(self): + if self._flatten is None: + arg=[] + self.expander.flatten(self.node, arg, self.variables) + + arg = u"".join(arg).strip() + if len(arg)>256*1024: + raise MemoryLimitError("template argument too long: %s bytes" % (len(arg),)) + + self._flatten = arg + return self._flatten + +class ArgumentList(object): + class notfound: pass + + def __init__(self): + self.args = [] + self.namedargs = {} + def __repr__(self): + return "<ARGLIST args=%r>" % ([x.flatten() for x in self.args],) + def append(self, a): + self.args.append(a) + + def get(self, n, default): + return self.__getitem__(n) or default + + def __iter__(self): + for x in self.args: + yield x + + def __getslice__(self, i, j): + for x in self.args[i:j]: + yield x.flatten() + + def __len__(self): + return len(self.args) + + def __getitem__(self, n): + if isinstance(n, (int, long)): + try: + a=self.args[n] + except IndexError: + return u"" + return a.flatten() + + assert isinstance(n, basestring), "expected int or string" + + varcount=1 + if n not in self.namedargs: + for x in self.args: + f=x.flatten() + if u"=" in f: + name, val = f.split(u"=", 1) + name = name.strip() + val = val.strip() + self.namedargs[name] = val + if n==name: + return val + else: + name = str(varcount) + varcount+=1 + self.namedargs[name] = f + + if n==name: + return f + self.namedargs[n] = u'' + + val = self.namedargs[n] + + return val + + +class Expander(object): + def __init__(self, txt, pagename="", wikidb=None): + assert wikidb is not None, "must supply wikidb argument in 
Expander.__init__" + self.db = wikidb + self.resolver = magics.MagicResolver(pagename=pagename) + self.resolver.wikidb = wikidb + + self.parsed = Parser(txt).parse() + #show(self.parsed) + self.parsedTemplateCache = {} + + self.blacklist = set() + with open("template_blacklist", 'r') as f: + for line in f.readlines(): + self.blacklist.add(line.rstrip()) + + def getParsedTemplate(self, name): + if name.startswith("[["): + return None + + if name.startswith(":"): + log.info("including article") + raw = self.db.getRawArticle(name[1:]) + else: + name = name[0].capitalize() + name[1:] + name = "Plantilla:" + name + try: + return self.parsedTemplateCache[name] + except KeyError: + pass + + # Check to see if this is a template in our blacklist -- + # one that we don't want to bother rendering. + if name in self.blacklist: + log.info("Skipping template " + name.encode('utf8')) + raw = None + else: + raw = self.db.getTemplate(name, True) + + if raw is None: + log.warn("no template", repr(name)) + res = None + else: + # add newline to templates starting with a (semi)colon, or tablemarkup + # XXX what else? 
see test_implicit_newline in test_expander + if raw.startswith(":") or raw.startswith(";") or raw.startswith("{|"): + raw = '\n'+raw + + log.info("parsing template", repr(name)) + res = Parser(raw).parse() + if DEBUG: + print "TEMPLATE:", name, repr(raw) + res.show() + + self.parsedTemplateCache[name] = res + return res + + + def flatten(self, n, res, variables): + if isinstance(n, Template): + name = [] + self.flatten(n.children[0], name, variables) + name = u"".join(name).strip() + if len(name)>256*1024: + raise MemoryLimitError("template name too long: %s bytes" % (len(name),)) + + remainder = None + if ":" in name: + try_name, try_remainder = name.split(':', 1) + if self.resolver.has_magic(try_name): + name=try_name + remainder = try_remainder + + var = ArgumentList() + + varcount = 1 #unnamed vars + + def args(): + if remainder is not None: + tmpnode=Node() + tmpnode.children.append(remainder) + yield tmpnode + for x in n.children[1:]: + yield x + + for x in args(): + var.append(LazyArgument(x, self, variables)) + + rep = self.resolver(name, var) + + if rep is not None: + res.append(rep) + else: + p = self.getParsedTemplate(name) + if p: + if DEBUG: + msg = "EXPANDING %r %s ===> " % (name, var) + oldidx = len(res) + self.flatten(p, res, var) + + if DEBUG: + msg += "".join(res[oldidx:]) + print msg + + + elif isinstance(n, Variable): + name = [] + self.flatten(n.children[0], name, variables) + name = u"".join(name).strip() + if len(name)>256*1024: + raise MemoryLimitError("template name too long: %s bytes" % (len(name),)) + + v = variables.get(name, None) + + if v is None: + if len(n.children)>1: + self.flatten(n.children[1:], res, variables) + else: + pass + # FIXME. 
breaks If + #res.append(u"{{{%s}}}" % (name,)) + else: + res.append(v) + else: + for x in n: + if isinstance(x, basestring): + res.append(x) + else: + self.flatten(x, res, variables) + + def expandTemplates(self): + res = [] + self.flatten(self.parsed, res, ArgumentList()) + return u"".join(res) + + +class DictDB(object): + """wikidb implementation used for testing""" + def __init__(self, *args, **kw): + if args: + self.d, = args + else: + self.d = {} + + self.d.update(kw) + + normd = {} + for k, v in self.d.items(): + normd[k.lower()] = v + self.d = normd + + def getRawArticle(self, title): + return self.d[title.lower()] + + def getTemplate(self, title, dummy): + return self.d.get(title.lower(), u"") + +def expandstr(s, expected=None, wikidb=None): + """debug function. expand templates in string s""" + if wikidb: + db = wikidb + else: + db = DictDB(dict(a=s)) + + te = Expander(s, pagename="thispage", wikidb=db) + res = te.expandTemplates() + print "EXPAND: %r -> %r" % (s, res) + if expected: + assert res==expected, "expected %r, got %r" % (expected, res) + return res + +if __name__=="__main__": + #print splitrx.groupindex + d=unicode(open(sys.argv[1]).read(), 'utf8') + e = Expander(d) + print e.expandTemplates() diff --git a/mwlib/expr.py b/mwlib/expr.py new file mode 100755 index 0000000..fa11ce9 --- /dev/null +++ b/mwlib/expr.py @@ -0,0 +1,222 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. +# based on pyparsing example code (SimpleCalc.py) + +"""Implementation of mediawiki's #expr template. +http://meta.wikimedia.org/wiki/ParserFunctions#.23expr: +""" + +from __future__ import division + +import re +import inspect +import math + +class ExprError(Exception): + pass + +def _myround(a,b): + r=round(a, int(b)) + if int(r)==r: + return int(r) + return r + + +pattern = """ +(?:\s+) +|((?:(?:\d+)(?:\.\d+)? + |(?:\.\d+)) (?:e(?:\+|-)?\d+)?) +|(\+|-|\*|/|>=|<=|<>|!=|[a-zA-Z]+|.) 
+""" + +rxpattern = re.compile(pattern, re.VERBOSE | re.DOTALL | re.IGNORECASE) +def tokenize(s): + res = [] + for (v1,v2) in rxpattern.findall(s): + if not (v1 or v2): + continue + v2=v2.lower() + if v2 in Expr.constants: + res.append((v2,"")) + else: + res.append((v1,v2)) + return res + + return [(v1,v2.lower()) for (v1,v2) in rxpattern.findall(s) if v1 or v2] + +class uminus: pass +class uplus: pass + +precedence = {"(":-1, ")":-1} +functions = {} + +def addop(op, prec, fun, numargs=None): + precedence[op] = prec + if numargs is None: + numargs = len(inspect.getargspec(fun)[0]) + + + def wrap(stack): + assert len(stack)>=numargs + args = tuple(stack[-numargs:]) + del stack[-numargs:] + stack.append(fun(*args)) + + functions[op] = wrap + +a=addop +a(uminus, 10, lambda x: -x) +a(uplus, 10, lambda x: x) +a("^", 10, math.pow, 2) +a("not", 9, lambda x:int(not(bool(x)))) +a("abs", 9, abs, 1) +a("sin", 9, math.sin, 1) +a("cos", 9, math.cos, 1) +a("asin", 9, math.asin, 1) +a("acos", 9, math.acos, 1) +a("tan", 9, math.tan, 1) +a("atan", 9, math.atan, 1) +a("exp", 9, math.exp, 1) +a("ln", 9, math.log, 1) +a("ceil", 9, lambda x: int(math.ceil(x))) +a("floor", 9, lambda x: int(math.floor(x))) +a("trunc", 9, long, 1) + +a("*", 8, lambda x,y: x*y) +a("/", 8, lambda x,y: x/y) +a("div", 8, lambda x,y: x/y) +a("mod", 8, lambda x,y: int(x)%int(y)) + + +a("+", 6, lambda x,y: x+y) +a("-", 6, lambda x,y: x-y) + +a("round", 5, _myround) + +a("<", 4, lambda x,y: int(x<y)) +a(">", 4, lambda x,y: int(x>y)) +a("<=", 4, lambda x,y: int(x<=y)) +a(">=", 4, lambda x,y: int(x>=y)) +a("!=", 4, lambda x,y: int(x!=y)) +a("<>", 4, lambda x,y: int(x!=y)) +a("=", 4, lambda x,y: int(x==y)) + +a("and", 3, lambda x,y: int(bool(x) and bool(y))) +a("or", 2, lambda x,y: int(bool(x) or bool(y))) +del a + +class Expr(object): + constants = dict( + e=math.e, + pi=math.pi) + + def as_float_or_int(self, s): + try: + return self.constants[s] + except KeyError: + pass + + if "." 
in s or "e" in s.lower(): + return float(s) + return long(s) + + def output_operator(self, op): + return functions[op](self.operand_stack) + + def output_operand(self, operand): + self.operand_stack.append(operand) + + def parse_expr(self, s): + tokens = tokenize(s) + if not tokens: + return "" + + self.operand_stack = [] + operator_stack = [] + + seen_operand=False + + last_operand, last_operator = False, True + + for operand, operator in tokens: + if operand: + if last_operand: + raise ExprError("expected operator") + self.output_operand(self.as_float_or_int(operand)) + elif operator=="(": + operator_stack.append("(") + elif operator==")": + while 1: + if not operator_stack: + raise ExprError("unbalanced parenthesis") + t = operator_stack.pop() + if t=="(": + break + self.output_operator(t) + elif operator in precedence: + if last_operator and last_operator!=")": + if operator=='-': + operator = uminus + elif operator=='+': + operator = uplus + + is_unary = operator in (uplus, uminus) + prec = precedence[operator] + while not is_unary and operator_stack and prec<=precedence[operator_stack[-1]]: + p = operator_stack.pop() + self.output_operator(p) + operator_stack.append(operator) + else: + raise ExprError("unknown operator: %r" % (operator,)) + + last_operand, last_operator = operand, operator + + + while operator_stack: + p=operator_stack.pop() + if p=="(": + raise ExprError("unbalanced parenthesis") + self.output_operator(p) + + if len(self.operand_stack)!=1: + raise ExprError("bad stack: %s" % (self.operand_stack,)) + + return self.operand_stack[-1] + +def expr(s): + return Expr().parse_expr(s) + +def main(): + ParseException = ExprError + import time + try: + import readline # do not remove. 
makes raw_input use readline + readline + except ImportError: + pass + + ep = expr + + while 1: + input_string = raw_input("> ") + if not input_string: + continue + + stime = time.time() + try: + res=expr(input_string) + except Exception, err: + print "ERROR:", err + import traceback + traceback.print_exc() + + continue + print res + print time.time()-stime, "s" + +if __name__=='__main__': + main() + + diff --git a/mwlib/htmlwriter.py b/mwlib/htmlwriter.py new file mode 100755 index 0000000..dabb979 --- /dev/null +++ b/mwlib/htmlwriter.py @@ -0,0 +1,436 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import os +from mwlib import parser, rendermath, timeline + +import urllib +import cgi + +#from PIL import Image + +from mwlib.log import Log + +log = Log("htmlwriter") + +class HTMLWriter(object): + imglevel = 0 + namedLinkCount = 1 + def __init__(self, out, images=None, math_renderer=None): + self.out = out + self.level = 0 + self.images = images + # self.images = imgdb.ImageDB(os.path.expanduser("~/images")) + self.references = [] + if math_renderer is None: + self.math_renderer = rendermath.Renderer() + else: + self.math_renderer = math_renderer + + def _write(self, s): + self.out.write(cgi.escape(s).encode('utf8')) + + def getCategoryList(self, obj): + categories = list(set(c.target for c in obj.find(parser.CategoryLink))) + categories.sort() + return categories + + def write(self, obj): + m = "write" + obj.__class__.__name__ + m=getattr(self, m) + m(obj) + + def ignore(self, obj): + pass + + def serializeVList(self,vlist): + args = [] + styleArgs = [] + gotClass = 0 + gotExtraClass = 0 + for (key,value) in vlist.items(): + if isinstance(value, (basestring, int)): + if key=="class": + args.append('%s="%s"' % (key, value)) + gotClass = 1 + else: + args.append('%s="%s"' % (key, value)) + if isinstance(value, dict) and key=="style": + for (_key,_value) in value.items(): + 
styleArgs.append("%s:%s" % (_key, _value)) + args.append(' style="%s"' % ';'.join(styleArgs)) + gotExtraClass = 1 + return ' '.join(args) + + + def writeMagic(self, m): + if m.values.get('html'): + for x in m.children: + self.write(x) + + def writeCaption(self, obj): + # todo- A table contained a Caption node, causing an exception in write. + # Not sure what the HTML should be, if any. + pass + + def writeSection(self, obj): + header = "h%s" % (obj.level) + self.out.write("<%s>" % header) + self.write(obj.children[0]) + self.out.write("</%s>" % header) + + self.level += 1 + for x in obj.children[1:]: + self.write(x) + self.level -= 1 + + def writePreFormatted(self, n): + self.out.write("<pre>") + for x in n: + self.write(x) + self.out.write("</pre>") + + def writeNode(self, n): + for x in n: + self.write(x) + + def writeCell(self, cell): + svl = "" + if cell.vlist: + svl = self.serializeVList(cell.vlist) + + self.out.write('<td %s>' % svl) + for x in cell: + self.write(x) + self.out.write("</td>") + + def writeTagNode(self, t): + if t.caption == 'ref': + self.references.append(t) + self.out.write("<sup>%s</sup>" % len(self.references)) + return + elif t.caption == 'references': + if not self.references: + return + + self.out.write("<ol>") + for r in self.references: + self.out.write("<li>") + for x in r: + self.write(x) + self.out.write("</li>") + self.out.write("</ol>") + + self.references = [] + return + elif t.caption=='imagemap': + # FIXME. this is not complete. t.imagemap.entries should also be handled. 
+ print "WRITEIMAGEMAP:", t.imagemap + if t.imagemap.imagelink: + self.write(t.imagemap.imagelink) + return + + + self.out.write(t.starttext.encode('utf8')) + for x in t: + self.write(x) + self.out.write(t.endtext.encode('utf8')) + + def writeRow(self, row): + self.out.write('<tr>') + for x in row: + self.write(x) + + self.out.write('</tr>') + + def writeTable(self, t): + svl = "" + if t.vlist: + svl = self.serializeVList(t.vlist) + + + + self.out.write("<table %s>" % svl) + if t.caption: + self.out.write("<caption>") + self.write(t.caption) + self.out.write("<caption>") + for x in t: + self.write(x) + self.out.write("</table>") + + def writeMath(self, obj): + latex = obj.caption + #p = self.math_renderer.render(latex) + self.out.write('<tt>%s</tt>' % latex) + + def writeURL(self, obj): + self.out.write('<a href="%s" class="offsite" ttid="externallink">' % obj.caption) + if obj.children: + for x in obj.children: + self.write(x) + else: + self.out.write(obj.caption) + + self.out.write(' <img src="/static/outgoing_link.gif" /></a>') + + def writeNamedURL(self, obj): + self.out.write('<a href="%s" class="offsite" ttid="externallink">' % obj.caption) + if obj.children: + for x in obj.children: + self.write(x) + else: + name = "[%s]" % self.namedLinkCount + self.namedLinkCount += 1 + self.out.write(name) + + self.out.write(' <img src="/static/outgoing_link.gif" /></a>') + + + def writeParagraph(self, obj): + self.out.write("\n<p>") + for x in obj: + self.write(x) + self.out.write("</p>\n") + + def getHREF(self, obj): + parts = obj.target.encode('utf-8').split('#') + parts[0] = parts[0].replace(" ", "_") + + + return '../%s/' % ("#".join([urllib.quote(x) for x in parts])) + + writeLangLink = ignore + + def writeLink(self, obj): + if obj.target is None: + return + + href = self.getHREF(obj) + if href is not None: + self.out.write('<a href="%s" class="normallink">' % (href,)) + else: + self.out.write('<a class="deadlink">') + if obj.children: + for x in obj.children: + 
self.write(x) + else: + self._write(obj.target) + + self.out.write("</a>") + + def writeSpecialLink(self, obj): + if obj.children: + for x in obj.children: + self.write(x) + else: + self._write(obj.target) + + def writeCategoryLink(self, obj): + if obj.colon: + if obj.children: + for x in obj.children: + self.write(x) + else: + self._write(obj.target) + + def writeTimeline(self, obj): + img = timeline.drawTimeline(obj.caption) + if img is None: + return + + target = "/timeline/"+os.path.basename(img) + width, height = Image.open(img).size + + self.out.write('<img src="%s" width="%s" height="%s" />' % (target, width, height)) + + def writeImageLink(self, obj): + """ + <span class='image'> + <span class='left'> + <img src='bla' /> + <span class='imagecaption'>bla bla</span> + <span/> + <span/> + """ + + if self.images is None: + return + + width = obj.width + height = obj.height + + #if not width: + # width = 400 # what could be a sensible default if no width is given? maybe better 0? + + if width: + path = self.images.getPath(obj.target, size=max(width, height)) + url = self.images.getURL(obj.target, size=max(width, height)) + else: + path = self.images.getPath(obj.target) + url = self.images.getURL(obj.target) + + if url is None: + return + + if isinstance(path, str): + path = unicode(path, 'utf8') + + if self.imglevel==0: + self.imglevel += 1 + + # WTB: Added the ability to not specify width & height since images may not be found locally. + # This may have to be redone eventually, perhaps we need a database of image dimensions, + # but I doubt it. Besides, more hardcoded pathnames in 'getimg'? 
+ try: + def getimg(): + return Image.open(path) + img = None + + if not width: + if not img: + img = getimg() + size = img.size + width = min(400, size[0]) + + if not height: + if not img: + img = getimg() + size = img.size + height = size[1]*width/size[0] + except IOError, err: + log.warn("Image.open failed:", err, "path=", repr(path)) + # WTB: Removed following return as images will not always be found locally. + #self.imglevel -= 1 + #return + + attr = '' + attr_css = '' + + if width: + attr += "width='%d' " % width + attr_css += "width:%dpx " % width + + if height: + attr += "height='%d' " % height + # WTB: Note- height not applied to CSS. + + if obj.isInline(): + self.out.write('<img src="%s" %s/>' % (url.encode("utf8"), attr.encode("utf8"))) + else: + # WTB: This looked like a mistake to me, it was modifying obj.align instead of align. + # This function should not modify obj at all. + align = obj.align + if obj.thumb == True and not align: + align = "clear right" + self.out.write('''<div class="bbotstyle image %s" style="%s">'''% (align, attr_css)) + self.out.write('<img src="%s" %s/>' % (url.encode("utf8"), attr.encode("utf8"))) + + self.out.write('<span class="imagecaption">') + for x in obj.children: + self.write(x) + self.out.write('</span></div>') + self.imglevel -= 1 + else: + self.out.write('<a href="%s">' % url) + for x in obj.children: + self.write(x) + self.out.write('</a>') + + def writeText(self, t): + #self.out.write(cgi.escape(t.caption).encode('ascii', 'xmlcharrefreplace')) + self._write(t.caption) + + writeControl = writeText + + def writeArticle(self, a): + if a.caption: + self.out.write("<h1>") + self._write(a.caption) + self.out.write(' <font size=1>· <a class="offsite" ') + self.out.write('href="http://es.wikipedia.org/wiki/') + self._write(a.caption) + self.out.write('">De Wikipedia, la enciclopedia libre</a>') + self.out.write("</font>") + self.out.write('</h1>') + + for x in a: + self.write(x) + + self.out.write("\n<br/>") + + def 
writeStyle(self, s): + if s.caption == "''": + tag = 'em' + elif s.caption=="'''''": + self.out.write("<strong><em>") + for x in s: + self.write(x) + self.out.write("</em></strong>") + return + elif s.caption == "'''": + tag = 'strong' + elif s.caption == ";": + self.out.write("<div><strong>") + for x in s: + self.write(x) + self.out.write("</strong></div>") + return + + elif s.caption.startswith(":"): + self.out.write("<blockquote>"*len(s.caption)) + for x in s: + self.write(x) + self.out.write("</blockquote>"*len(s.caption)) + return + elif s.caption == "overline": + self.out.write('<u style="text-decoration: overline;">') + for x in s: + self.write(x) + self.out.write('</u>') + return + else: + tag = s.caption + + + self.out.write("<%s>" % tag) + for x in s: + self.write(x) + self.out.write("</%s>" % tag) + + def writeItem(self, item): + self.out.write("<li>") + for x in item: + self.write(x) + self.out.write("</li>\n") + + def writeItemList(self, lst): + if lst.numbered: + tag = "ol" + else: + tag = "ul" + + self.out.write("<%s>" % tag) + + for x in lst: + self.write(x) + self.out.write("\n") + + self.out.write("</%s>" % tag) + + +class NoLinksWriter(HTMLWriter): + """Subclass that ignores (non-outgoing) links""" + + def writeLink(self, obj): + if obj.target is None: + return + + if obj.children: + for x in obj.children: + self.write(x) + else: + self._write(obj.target) + diff --git a/mwlib/imgmap.py b/mwlib/imgmap.py new file mode 100755 index 0000000..80bb826 --- /dev/null +++ b/mwlib/imgmap.py @@ -0,0 +1,122 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +from pyparsing import (Literal, restOfLine, Word, nums, Group, + ZeroOrMore, OneOrMore, And, Suppress, LineStart, + LineEnd, StringEnd, ParseException, Optional, White) + +class gob(object): + def __init__(self, **kw): + self.__dict__.update(kw) + + def __repr__(self): + return "<%s %r>" % (self.__class__.__name__, self.__dict__) + +class Poly(gob): pass +class Rect(gob): pass +class Circle(gob): pass +class Comment(gob): pass +class Desc(gob): pass +class Default(gob): pass +class ImageMap(gob): pass + +def _makepoly(tokens): + return Poly(caption=tokens[2].strip(), vertices=list(tokens[1])) + +def _makerect(tokens): + return Rect(caption=tokens[-1].strip(), top_left=tuple(tokens[1]), bottom_right=tuple(tokens[2])) + +def _makecomment(tokens): + return Comment(comment=tokens[1]) + +def _makecircle(tokens): + return Circle(caption=tokens[3].strip(), center=tokens[1], radius=tokens[2]) + +def _makedesc(tokens): + return Desc(location=tokens[1]) + +def _makeimagemap(tokens): + image = None + for x in tokens: + if isinstance(x, basestring): + image = x + break + return ImageMap(entries=list(tokens), image=image) + + +comment = (Literal('#')+restOfLine).setParseAction(_makecomment) + +integer = Word(nums).setParseAction(lambda s: int(s[0])) +integer_pair = (integer+integer).setParseAction(lambda x: tuple(x)) + +poly = Literal("poly")+Group(ZeroOrMore(integer_pair))+restOfLine +poly = poly.setParseAction(_makepoly) + +rect = Literal("rect")+integer_pair+integer_pair+restOfLine +rect = rect.setParseAction(_makerect) + +circle = Literal("circle")+integer_pair+integer+restOfLine +circle = circle.setParseAction(_makecircle) + +desc = Literal("desc") + (Literal("top-right") + |Literal("bottom-right") + |Literal("bottom-left") + |Literal("top-left") + |Literal("none")) +desc = desc.setParseAction(_makedesc) +default = Literal("default")+restOfLine +default.setParseAction(lambda t: Default(caption=t[1].strip())) + + +def _makeother(tokens): + if not tokens[0]: + return 
[None] + return tokens + +# we can't use restOfLine.setParseAction(_makeother) as that sets the +# parse action for any occurence of restOfLine + +other = And([restOfLine]).setParseAction(_makeother) +line = Suppress(LineStart()) + (comment | poly | rect | circle | desc | default | other) + Suppress(LineEnd()) +imagemap = ZeroOrMore(line) + StringEnd() +imagemap.setParseAction(_makeimagemap) + +def ImageMapFromString(s): + # uhh. damn. can't get pyparsing to parse + # commands, other lines (i.e. syntax errors strictly speaking) + # and lines containing only whitespace... + lines = [] + for x in s.split("\n"): + x=x.strip() + if x: + lines.append(x) + s="\n".join(lines) + + try: + return imagemap.parseString(s)[0] + except ParseException, err: + return ImageMap(entries=[], image=None) + +def main(): + ex=""" + + +Image:Foo.jpg|200px|picture of a foo +poly 131 45 213 41 210 110 127 109 [[Display]] +poly 104 126 105 171 269 162 267 124 [[Keyboard]] +rect 15 95 94 176 [[Foo type A]] +# A comment, this line is ignored +circle 57 57 20 [[Foo type B]] +desc bottom-left +default [[Mainz]] +---dfg-sdfg--sdfg +blubb +""" + res = ImageMapFromString(ex) + for x in res.entries: + print x + +if __name__=='__main__': + main() diff --git a/mwlib/lang.py b/mwlib/lang.py new file mode 100755 index 0000000..ca122a0 --- /dev/null +++ b/mwlib/lang.py @@ -0,0 +1,10 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +import os +languages = set(open(os.path.join(os.path.dirname(__file__), 'lang.txt')).read().split()) + + + diff --git a/mwlib/lang.txt b/mwlib/lang.txt new file mode 100644 index 0000000..9dfb78e --- /dev/null +++ b/mwlib/lang.txt @@ -0,0 +1,253 @@ +af +als +am +an +ang +ar +ast +az +ba +be +bg +bm +bn +bo +br +bs +ca +ceb +chr +co +cs +csb +cv +cy +da +de +el +en +eo +es +et +eu +fa +fi +fiu-vro +fo +fr +frp +fur +fy +ga +gd +gl +gn +gu +he +hi +hr +ht +hu +hy +ia +id +ie +ilo +io +is +it +iu +ja +jbo +jv +ka +kg +km +kn +ko +ks +ksh +ku +kw +ky +la +lad +lb +li +ln +lt +lv +mg +mi +mk +ml +mo +mr +ms +mt +my +na +nah +nap +nb +nds +nds-nl +ng +nl +nn +no +nrm +nv +oc +os +pam +pap +pdc +pl +ps +pt +rm +ro +roa-rup +ru +sa +sc +scn +sco +se +sh +si +simple +sk +sl +sm +so +sq +sr +st +su +sv +sw +ta +te +th +tk +tl +to +tpi +tr +tt +ug +uk +ur +vec +vi +vo +wa +war +yi +za +zh +zh-min-nan +zh-yue +pms +dv +got +haw +wo +tet +qu +lmo +be-x-old +hsb +pag +bat-smg +bpy +lij +udm +nov +cbk-zam +arc +kab +ru-sib +diq +gv +zea +wuu +cdo +lg +hak +ty +lo +tn +ti +tg +dk +yo +dz +vls +bar +eml +bxr +ee +rmy +eve +zu +rw +new +rn +xal +bh +bi +wp +om +glk +tw +or +aa +xh +ch +ce +cr +fj +cu +zh-tw +pa +chy +pi +hz +ho +bug +uz +mn +ik +ss +kj +bet +pih +ab +ve +ak +ii +as +av +ay +ig +nan +zh-min-nan +ne +ny +sn +ff +mh +mzn +kk +ki +kl +kv +sg +sd +roa-tara +zh-classical +zh-cn +map-bms diff --git a/mwlib/licenses.py b/mwlib/licenses.py new file mode 100644 index 0000000..2e96efd --- /dev/null +++ b/mwlib/licenses.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python + +"""Mapping of lower-cased template names of licenses to their normalized name. 
+This file has been automatically generated with tools/get_license_templates.py +""" + +lower2normal = {u'attr-tartu': u'Attr-Tartu', + u'attribution': u'Attribution', + u'attribution entomart': u'Attribution Entomart', + u'attribution-ubisoft': u'Attribution-Ubisoft', + u'attribution-ubisoft/ja': u'Attribution-Ubisoft/ja', + u'attribution-ubisoft/ko': u'Attribution-Ubisoft/ko', + u'attribution-ubisoft/nl': u'Attribution-Ubisoft/nl', + u'attribution/lv': u'Attribution/lv', + u'attribution/zh-hant': u'Attribution/zh-hant', + u'autotravel': u'Autotravel', + u'bsdu': u'BSDu', + u'cc-by-1.0': u'Cc-by-1.0', + u'cc-by-1.0-nl': u'Cc-by-1.0-nl', + u'cc-by-2.0': u'Cc-by-2.0', + u'cc-by-2.0-be': u'Cc-by-2.0-be', + u'cc-by-2.0-br': u'Cc-by-2.0-br', + u'cc-by-2.0-cl': u'Cc-by-2.0-cl', + u'cc-by-2.0-es': u'Cc-by-2.0-es', + u'cc-by-2.0-fr': u'Cc-by-2.0-fr', + u'cc-by-2.0-it': u'Cc-by-2.0-it', + u'cc-by-2.0-kr': u'Cc-by-2.0-kr', + u'cc-by-2.0-nl': u'Cc-by-2.0-nl', + u'cc-by-2.0-uk': u'Cc-by-2.0-uk', + u'cc-by-2.1-au': u'Cc-by-2.1-au', + u'cc-by-2.1-es': u'Cc-by-2.1-es', + u'cc-by-2.1-jp': u'Cc-by-2.1-jp', + u'cc-by-2.5': u'Cc-by-2.5', + u'cc-by-2.5-bg': u'Cc-by-2.5-bg', + u'cc-by-2.5-br': u'Cc-by-2.5-br', + u'cc-by-2.5-dk': u'Cc-by-2.5-dk', + u'cc-by-2.5-in': u'Cc-by-2.5-in', + u'cc-by-2.5-it': u'Cc-by-2.5-it', + u'cc-by-2.5-my': u'Cc-by-2.5-my', + u'cc-by-2.5-nl': u'Cc-by-2.5-nl', + u'cc-by-2.5-pl': u'Cc-by-2.5-pl', + u'cc-by-2.5-se': u'Cc-by-2.5-se', + u'cc-by-3.0': u'Cc-by-3.0', + u'cc-by-3.0-gr': u'Cc-by-3.0-gr', + u'cc-by-3.0-indiafm': u'Cc-by-3.0-IndiaFM', + u'cc-by-3.0-nl': u'Cc-by-3.0-nl', + u'cc-by-3.0-rs': u'Cc-by-3.0-rs', + u'cc-by-3.0-us': u'Cc-by-3.0-us', + u'cc-by-nc-sa-2.0-dual': u'Cc-by-nc-sa-2.0-dual', + u'cc-by-sa-1.0': u'Cc-by-sa-1.0', + u'cc-by-sa-1.0-fi': u'Cc-by-sa-1.0-fi', + u'cc-by-sa-1.0-tw': u'Cc-by-sa-1.0-tw', + u'cc-by-sa-2.0': u'Cc-by-sa-2.0', + u'cc-by-sa-2.0-at': u'Cc-by-sa-2.0-at', + u'cc-by-sa-2.0-be': u'Cc-by-sa-2.0-be', + u'cc-by-sa-2.0-br': 
u'Cc-by-sa-2.0-br', + u'cc-by-sa-2.0-ca': u'Cc-by-sa-2.0-ca', + u'cc-by-sa-2.0-cl': u'Cc-by-sa-2.0-cl', + u'cc-by-sa-2.0-de': u'Cc-by-sa-2.0-de', + u'cc-by-sa-2.0-es': u'Cc-by-sa-2.0-es', + u'cc-by-sa-2.0-fr': u'Cc-by-sa-2.0-fr', + u'cc-by-sa-2.0-it': u'Cc-by-sa-2.0-it', + u'cc-by-sa-2.0-kr': u'Cc-by-sa-2.0-kr', + u'cc-by-sa-2.0-nl': u'Cc-by-sa-2.0-nl', + u'cc-by-sa-2.0-tw': u'Cc-by-sa-2.0-tw', + u'cc-by-sa-2.0-uk': u'Cc-by-sa-2.0-uk', + u'cc-by-sa-2.1-au': u'Cc-by-sa-2.1-au', + u'cc-by-sa-2.1-es': u'Cc-by-sa-2.1-es', + u'cc-by-sa-2.1-jp': u'Cc-by-sa-2.1-jp', + u'cc-by-sa-2.5': u'Cc-by-sa-2.5', + u'cc-by-sa-2.5,1.0': u'Cc-by-sa-2.5,1.0', + u'cc-by-sa-2.5,2.0,1.0': u'Cc-by-sa-2.5,2.0,1.0', + u'cc-by-sa-2.5,2.0,1.0-no-link': u'Cc-by-sa-2.5,2.0,1.0-no-link', + u'cc-by-sa-2.5-ar': u'Cc-by-sa-2.5-ar', + u'cc-by-sa-2.5-au': u'Cc-by-sa-2.5-au', + u'cc-by-sa-2.5-bg': u'Cc-by-sa-2.5-bg', + u'cc-by-sa-2.5-br': u'Cc-by-sa-2.5-br', + u'cc-by-sa-2.5-ca': u'Cc-by-sa-2.5-ca', + u'cc-by-sa-2.5-ch': u'Cc-by-sa-2.5-ch', + u'cc-by-sa-2.5-cl': u'Cc-by-sa-2.5-cl', + u'cc-by-sa-2.5-cn': u'Cc-by-sa-2.5-cn', + u'cc-by-sa-2.5-de': u'Cc-by-sa-2.5-de', + u'cc-by-sa-2.5-dk': u'Cc-by-sa-2.5-dk', + u'cc-by-sa-2.5-es': u'Cc-by-sa-2.5-es', + u'cc-by-sa-2.5-hu': u'Cc-by-sa-2.5-hu', + u'cc-by-sa-2.5-in': u'Cc-by-sa-2.5-in', + u'cc-by-sa-2.5-it': u'Cc-by-sa-2.5-it', + u'cc-by-sa-2.5-mx': u'Cc-by-sa-2.5-mx', + u'cc-by-sa-2.5-nl': u'Cc-by-sa-2.5-nl', + u'cc-by-sa-2.5-pl': u'Cc-by-sa-2.5-pl', + u'cc-by-sa-2.5-pt': u'Cc-by-sa-2.5-pt', + u'cc-by-sa-2.5-se': u'Cc-by-sa-2.5-se', + u'cc-by-sa-2.5-si': u'Cc-by-sa-2.5-si', + u'cc-by-sa-2.5-tw': u'Cc-by-sa-2.5-tw', + u'cc-by-sa-3.0': u'Cc-by-sa-3.0', + u'cc-by-sa-3.0,2.5,2.0,1.0': u'Cc-by-sa-3.0,2.5,2.0,1.0', + u'cc-by-sa-3.0,2.5,2.0,1.0-no-link': u'Cc-by-sa-3.0,2.5,2.0,1.0-no-link', + u'cc-by-sa-3.0-gr': u'Cc-by-sa-3.0-gr', + u'cc-by-sa-3.0-nl': u'Cc-by-sa-3.0-nl', + u'cc-by-sa-3.0-rs': u'Cc-by-sa-3.0-rs', + u'cc-by-sa-3.0-tw': u'Cc-by-sa-3.0-tw', + 
u'cc-by-sa-3.0-us': u'Cc-by-sa-3.0-us', + u'cc-by-sa-jul': u'Cc-by-sa-jul', + u'cecill': u'CeCILL', + u'cng': u'CNG', + u'elephants dream': u'Elephants Dream', + u'fal': u'FAL', + u'geograph': u'Geograph', + u'gfdl': u'GFDL', + u'gfdl or cc-by-nc-sa': u'GFDL or cc-by-nc-sa', + u'gfdl or cc-by-nc-sa/2.5': u'GFDL or cc-by-nc-sa/2.5', + u'gfdl-1.2': u'GFDL-1.2', + u'gfdl-1.2-en': u'GFDL-1.2-en', + u'gfdl-1.2/es': u'GFDL-1.2/es', + u'gfdl-1.2/vi': u'GFDL-1.2/vi', + u'gfdl-cc-triple': u'GFDL-CC-triple', + u'gfdl-dd': u'GFDL-DD', + u'gfdl-en': u'GFDL-en', + u'gfdl-en/bg': u'GFDL-en/bg', + u'gfdl-en/fr': u'GFDL-en/fr', + u'gfdl-en/pl': u'GFDL-en/pl', + u'gfdl-gmt': u'GFDL-GMT', + u'gfdl-is': u'GFDL-IS', + u'gfdl-it': u'GFDL-it', + u'gfdl-ja': u'GFDL-ja', + u'gfdl-landsat-kashmir3d': u'GFDL-Landsat-Kashmir3d', + u'gfdl-opengeodb': u'GFDL-OpenGeoDB', + u'gfdl-retouched': u'GFDL-retouched', + u'gfdl-samoborac': u'GFDL-Samoborac', + u'gfdl-self': u'GFDL-self', + u'gfdl-user': u'GFDL-user', + u'gfdl-user-als': u'GFDL-user-als', + u'gfdl-user-ar': u'GFDL-user-ar', + u'gfdl-user-bat-smg': u'GFDL-user-bat-smg', + u'gfdl-user-bs': u'GFDL-user-bs', + u'gfdl-user-cs': u'GFDL-user-cs', + u'gfdl-user-da': u'GFDL-user-da', + u'gfdl-user-de': u'GFDL-user-de', + u'gfdl-user-el': u'GFDL-user-el', + u'gfdl-user-en-no-disclaimers': u'GFDL-user-en-no-disclaimers', + u'gfdl-user-en-note': u'GFDL-user-en-note', + u'gfdl-user-en-with-disclaimers': u'GFDL-user-en-with-disclaimers', + u'gfdl-user-es': u'GFDL-user-es', + u'gfdl-user-fa': u'GFDL-user-fa', + u'gfdl-user-fi': u'GFDL-user-fi', + u'gfdl-user-fr': u'GFDL-user-fr', + u'gfdl-user-gl': u'GFDL-user-gl', + u'gfdl-user-he': u'GFDL-user-he', + u'gfdl-user-hi': u'GFDL-user-hi', + u'gfdl-user-hu': u'GFDL-user-hu', + u'gfdl-user-id': u'GFDL-user-id', + u'gfdl-user-it': u'GFDL-user-it', + u'gfdl-user-ja': u'GFDL-user-ja', + u'gfdl-user-ko': u'GFDL-user-ko', + u'gfdl-user-lt': u'GFDL-user-lt', + u'gfdl-user-nl': u'GFDL-user-nl', + u'gfdl-user-nn': 
u'GFDL-user-nn', + u'gfdl-user-no': u'GFDL-user-no', + u'gfdl-user-pl': u'GFDL-user-pl', + u'gfdl-user-pt': u'GFDL-user-pt', + u'gfdl-user-ru': u'GFDL-user-ru', + u'gfdl-user-sk': u'GFDL-user-sk', + u'gfdl-user-sq': u'GFDL-user-sq', + u'gfdl-user-tr': u'GFDL-user-tr', + u'gfdl-user-uk': u'GFDL-user-uk', + u'gfdl-user-vi': u'GFDL-user-vi', + u'gfdl-user-vls': u'GFDL-user-vls', + u'gfdl-user-w': u'GFDL-user-w', + u'gfdl-user-zh': u'GFDL-user-zh', + u'gpl': u'GPL', + u'gplv2 only': u'GPLv2 only', + u'gplv3': u'GPLv3', + u'inewton': u'INewton', + u'lgpl': u'LGPL', + u'mdb': u'MdB', + u'met.no': u'Met.no', + u'norges golfforbund': u'Norges Golfforbund', + u'open font': u'Open Font', + u'parlament.ch': u'Parlament.ch', + u'picswiss': u'Picswiss', + u'polishsenatecopyright': u'PolishSenateCopyright', + u'stationsweb': u'Stationsweb', + u'statistics netherlands map': u'Statistics Netherlands map', + u'swiss government portrait': u'Swiss Government Portrait', + u'www.nordenskirker.dk': u'Www.nordenskirker.dk'} diff --git a/mwlib/log.py b/mwlib/log.py new file mode 100755 index 0000000..9afef8f --- /dev/null +++ b/mwlib/log.py @@ -0,0 +1,53 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +import sys + +class Stdout(object): + """late-bound sys.stdout""" + def write(self, msg): + sys.stdout.write(msg) + + def flush(self): + sys.stdout.flush() + +class Stderr(object): + """late-bound sys.stderr""" + def write(self, msg): + sys.stderr.write(msg) + + def flush(self): + sys.stderr.flush() + +class Log(object): + logfile = Stderr() + + def __init__(self, prefix=None): + if prefix is None: + self._prefix = [] + else: + if isinstance(prefix, basestring): + self._prefix = [prefix] + else: + self._prefix = prefix + + def __getattr__(self, name): + return Log([self, name]) + + def __nonzero__(self): + return bool(self._prefix) + + def __str__(self): + return ".".join(str(x) for x in self._prefix if x) + + def __call__(self, msg, *args): + if not self.logfile: + return + + if args: + msg = " ".join(([msg] + [repr(x) for x in args])) + + s = "%s >> %s\n" % (".".join(str(x) for x in self._prefix if x), msg) + self.logfile.write(s) diff --git a/mwlib/magics.py b/mwlib/magics.py new file mode 100755 index 0000000..4246ba5 --- /dev/null +++ b/mwlib/magics.py @@ -0,0 +1,469 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +"""expand magic variables/colon functions +http://meta.wikimedia.org/wiki/Help:Colon_function +http://meta.wikimedia.org/wiki/Help:Magic_words +http://meta.wikimedia.org/wiki/ParserFunctions +""" + +import datetime +import urllib +from mwlib.log import Log +from mwlib import expr + +log = Log("expander") + +def singlearg(fun): + def wrap(self, args): + rl=args + if not rl: + a=u'' + else: + a=rl[0] + + return fun(self, a) + + return wrap + +def noarg(fun): + def wrap(self, *args): + return fun(self) + return wrap + +def as_numeric(x): + try: + return int(x) + except ValueError: + pass + return float(x) + + +def maybe_numeric_compare(a,b): + if a==b: + return True + try: + a=as_numeric(a) + b=as_numeric(b) + except ValueError: + return False + + return a==b + + +class OtherMagic(object): + def DEFAULTSORT(self, args): + """see http://en.wikipedia.org/wiki/Template:DEFAULTSORT""" + return u"" + + +class TimeMagic(object): + now = datetime.datetime.now() + + @noarg + def CURRENTDAY(self): + """Displays the current day in numeric form.""" + return "%s" % self.now.day + + @noarg + def CURRENTDAY2(self): + """[MW1.5+] Ditto with leading zero 01 .. 31).""" + return "%02d" % self.now.day + + @noarg + def CURRENTDAYNAME(self): + """Displays the current day in named form.""" + return self.now.strftime("%A") + + @noarg + def CURRENTDOW(self): + """current day as number (0=Sunday, 1=Monday...).""" + return str((self.now.weekday()+1) % 7) + + @noarg + def CURRENTMONTH(self): + """The number 01 .. 12 of the current month.""" + return "%02d" % self.now.month + + @noarg + def CURRENTMONTHABBREV(self): + """[MW1.5+] current month abbreviated Jan .. Dec.""" + return self.now.strftime("%b") + + @noarg + def CURRENTMONTHNAME(self): + """current month in named form January .. December. """ + return self.now.strftime("%B") + + @noarg + def CURRENTTIME(self): + """The current time of day (00:00 .. 
23:59).""" + return self.now.strftime("%H:%M") + + @noarg + def CURRENTWEEK(self): + """Number of the current week (1-53) according to ISO 8601 with no leading zero.""" + return str(self.now.isocalendar()[1]) + + @noarg + def CURRENTYEAR(self): + """Returns the current year.""" + return str(self.now.year) + + @noarg + def CURRENTTIMESTAMP(self): + """[MW1.7+] Returns the current time stamp. e.g.: 20060528125203""" + return self.now.strftime("%Y%m%d%H%M%S") + + def MONTHNAME(self, args): + rl = args + if not rl: + return u"Missing required parameter 1=month!" + try: + m=int(rl[0].strip()) % 12 + except ValueError: + return u"month should be an integer" + if m==0: + m=12 + + return datetime.datetime(2000, m, 1).strftime("%B") + +class PageMagic(object): + def __init__(self, pagename='', server="http://en.wikipedia.org", revisionid=0): + self.pagename = pagename + self.server = server + self.revisionid = revisionid + + def PAGENAME(self, args): + """Returns the name of the current page, including all levels (Title/Subtitle/Sub-subtitle)""" + return self.pagename + + def PAGENAMEE(self, args): + """same as PAGENAME but More URL-friendly percent encoded + special characters (To use an articlename in an external link). + """ + return urllib.quote(self.pagename.encode('utf8')) + + + def SUBPAGENAME(self, args): + """[MW1.6+] Returns the name of the current page, excluding parent + pages ('Title/Subtitle' becomes 'Subtitle'). 
+ """ + return self.pagename.split('/')[-1] + + def SUBPAGENAMEE(self, args): + return urllib.quote(self.SUBPAGENAMEE()) + + def BASEPAGENAME(self, args): + """[MW1.7+] The basename of a subpage ('Title/Subtitle' becomes 'Title') + """ + return self.pagename.rsplit('/', 1)[0] + + def BASEPAGENAMEE(self, args): + """[MW1.7+] The basename of a subpage ('Title/Subtitle' becomes 'Title') + """ + return urllib.quote(self.BASEPAGENAME(args)) + + def NAMESPACE(self, args): + """Returns the name of the namespace the current page resides in.""" + return u"" # we currently only have articles living in the main/empty namespace + + def NAMESPACEE(self, args): + """Returns the name of the namespace the current page resides in. (quoted)""" + return urllib.quote(self.NAMESPACE(args)) + + def REVISIONID(self, args): + """[MW1.5+] The unique identifying number of a page, see Help:Diff.""" + return str(self.revisionid) + + @noarg + def SITENAME(self): + """Value of $wgSitename.""" + return "" + + def NS(self, args): + """Returns the name of a given namespace number.""" + return "++NS not implemented++" + + def LOCALURL(self, args): + """Returns the local URL of a given page. The page might not exist.""" + try: + url = "/wiki"+ "".join(args) + except: + url = '' # FIXME + return "/wiki"+url + + def LOCALURLE(self, args): + """Returns the local URL of a given page. 
The page might not exist.""" + return urllib.quote(self.LOCALURL(args)) + + def URLENCODE(self, args): + """[MW1.7+] To use a variable (parameter in a template) with spaces in an external link.""" + try: + url = urllib.quote_plus("".join(args[0])) + except: + url = "".join(args[0]) + return url + + @noarg + def SERVER(self): + """Value of $wgServer""" + return self.server + + def FULLURL(self, args): + return u'' + u = "".join(args) + self.SERVERNAME({}) + + @noarg + def SERVERNAME(self): + return self.SERVER({})[len("http://"):] + + +class NumberMagic(object): + def DISPLAYTITLE(self, args): + """[MW 1.7+] (unclear)""" + return "" + + def NUMBEROFARTICLES(self, args): + """A variable which returns the total number of articles on the Wiki.""" + return "0" + + def NUMBEROFPAGES(self, args): + """[MW1.7+] Returns the total number of pages. """ + return "0" + + def NUMBEROFFILES(self, args): + """[MW1.5+] Returns the number of uploaded files (rows in the image table).""" + return "0" + + def NUMBEROFUSERS(self, args): + """[MW1.7+] Returns the number of registered users (rows in the user table).""" + return "0" + + def CURRENTVERSION(self, args): + """[MW1.7+] Returns the current version of MediaWiki being run. 
[5]""" + return "1.7alpha" + + + +class StringMagic(object): + @singlearg + def LC(self, a): + return a.lower() + + @singlearg + def UC(self, a): + return a.upper() + + @singlearg + def LCFIRST(self, a): + return a[:1].lower()+a[1:] + + @singlearg + def UCFIRST(self, a): + return a[:1].upper()+a[1:] + + @singlearg + def FORMATNUM(self, a): + return a + +class ParserFunctions(object): + wikidb = None + def _error(self,s): + return '<strong class="error">%s</strong>' % (s,) + + def TAG(self, args): + name = args[0].strip() + r= u"<%s>%s</%s>" % (name, args[1], name) + return r + + + def IF(self, rl): + if rl[0]: + return rl[1] + else: + return rl[2] + + def IFEXIST(self, args): + name = args[0] + if not self.wikidb: + return args.get(args[2], "") + + # wrong place. FIXME. + if ':' in name: + ns, name = name.split(':', 1) + if ns.lower() in ['vorlage', 'template']: + r=self.wikidb.getTemplate(name) + else: + r=None + else: + r=self.wikidb.getRawArticle(name) + + if r: + return args[1] + else: + return args[2] + + + + def IFEQ(self, rl): + if maybe_numeric_compare(rl[0], rl[1]): + return rl[2] + else: + return rl[3] + + def EXPR(self, rl): + if rl: + try: + r=str(expr.expr(rl[0])) + except Exception, err: + return self._error(err) + + if "e" in r: + f,i = r.split("e") + i=int(i) + if i<0: + sign = '' + else: + sign = '+' + fixed=str(float(f))+"E"+sign+str(int(i)) + return fixed + return r + return u"0" + + + def IFEXPR(self, rl): + try: + r = expr.expr(rl[0]) + except Exception, err: + return self._error(err) + + if r: + return rl[1] + else: + return rl[2] + + def SWITCH(self, args): + """see http://meta.wikimedia.org/wiki/ParserFunctions#.23switch:""" + cmpval = args[0].strip() + found=False # used for fall through + for c in args[1:]: + if '=' in c: + val, result = c.split('=', 1) + val=val.strip() + result=result.strip() + if found or maybe_numeric_compare(val, cmpval): + return result + else: + if maybe_numeric_compare(cmpval,c.strip()): + found=True + + 
d=args["#default"] + if d: + return d + + + last = args[-1] + + if '=' not in last: + return last + return u'' + + def TITLEPARTS(self, args): + title = args[0] + try: + numseg = int(args[1]) + except ValueError: + numseq = 0 + + try: + start = int(args[2]) + except ValueError: + start = 1 + + if start>0: + start -= 1 + + parts = title.split("/")[start:] + if numseg: + parts = parts[:numseg] + return "/".join(parts) + + def IFERROR(self, args): + errmark = '<strong class="error">' + val = args[0] + bad=args[1] + good=args[2] or val + + if errmark in val: + return bad + else: + return good + + +for x in dir(ParserFunctions): + if x.startswith("_"): + continue + setattr(ParserFunctions, "#"+x, getattr(ParserFunctions, x)) + delattr(ParserFunctions, x) + +class DummyResolver(object): + pass + +class MagicResolver(TimeMagic, PageMagic, NumberMagic, StringMagic, ParserFunctions, OtherMagic, DummyResolver): + def __call__(self, name, args): + try: + name = str(name) + except UnicodeEncodeError: + return None + + + m = getattr(self, name.upper(), None) + if m is None: + return None + + if isinstance(m, basestring): + return m + + res = m(args) or '' # FIXME: catch TypeErros + assert isinstance(res, basestring), "MAGIC %r returned %r" % (name, res) + return res + + def has_magic(self, name): + try: + name = str(name) + except UnicodeEncodeError: + return False + + + m = getattr(self, name.upper(), None) + return m is not None + + + +magic_words = ['basepagename', 'basepagenamee', 'contentlanguage', 'currentday', 'currentday2', 'currentdayname', 'currentdow', 'currenthour', 'currentmonth', 'currentmonthabbrev', 'currentmonthname', 'currentmonthnamegen', 'currenttime', 'currenttimestamp', 'currentversion', 'currentweek', 'currentyear', 'defaultsort', 'directionmark', 'displaytitle', 'fullpagename', 'fullpagenamee', 'language', 'localday', 'localday2', 'localdayname', 'localdow', 'localhour', 'localmonth', 'localmonthabbrev', 'localmonthname', 'localmonthnamegen', 
'localtime', 'localtimestamp', 'localweek', 'localyear', 'namespace', 'namespacee', 'newsectionlink', 'numberofadmins', 'numberofarticles', 'numberofedits', 'numberoffiles', 'numberofpages', 'numberofusers', 'pagename', 'pagenamee', 'pagesinnamespace', 'revisionday', 'revisionday2', 'revisionid', 'revisionmonth', 'revisiontimestamp', 'revisionyear', 'scriptpath', 'server', 'servername', 'sitename', 'subjectpagename', 'subjectpagenamee', 'subjectspace', 'subjectspacee', 'subpagename', 'subpagenamee', 'talkpagename', 'talkpagenamee', 'talkspace', 'talkspacee', 'urlencode'] + +def _populate_dummy(): + m=MagicResolver() + + def get_dummy(name): + def resolve(*args): + log.warn("using dummy resolver for %s" % (name,)) + return u"" + return resolve + + missing = set() + for x in magic_words: + if not m.has_magic(x): + missing.add(x) + setattr(DummyResolver, x.upper(), get_dummy(x)) + + if missing: + missing = list(missing) + missing.sort() + #log.info("installed dummy resolvers for %s" % (", ".join(missing),)) + +_populate_dummy() diff --git a/mwlib/metabook.py b/mwlib/metabook.py new file mode 100755 index 0000000..e36e70d --- /dev/null +++ b/mwlib/metabook.py @@ -0,0 +1,119 @@ +#! /usr/bin/env python +#! -*- coding:utf-8 -*- + +import re +import simplejson + +""" +See METABOOK.txt for description of Metabook data +""" + +class MetaBook(object): + """Encapsulate meta information about an article collection""" + + title = u"" + subtitle = u"" + + def __init__(self): + self.type = 'collection' + self.version = 1 + self.items = [] + + def addArticles(self, articleTitles, chapterTitle=None, contentType='text/x-wiki'): + """ + @param articleTitles: sequence of article titles or dicts containing + article title (value for key 'title') and optionally display title + (value for key 'displaytitle'). 
+ @type articleTitles: [unicode|{str: unicode}] + """ + + articles = [] + for title in articleTitles: + article = { + 'type': 'article', + 'content-type': contentType, + } + if isinstance(title, dict): + article.update(title) + else: + article['title'] = title + articles.append(article) + if chapterTitle: + self.items.append({ + 'type': 'chapter', + 'title': chapterTitle, + 'items': articles, + }) + else: + self.items.extend(articles) + + def dumpJson(self): + return simplejson.dumps(vars(self)) + + def loadJson(self, jsonStr): + for (var, value) in simplejson.loads(jsonStr).items(): + setattr(self, var, value) + + def readJsonFile(self, filename): + self.loadJson(open(filename, 'rb').read()) + + def loadCollectionPage(self, mwcollection): + """Parse wikitext of a MediaWiki collection page + + @param mwcollection: wikitext of a MediaWiki collection page as created by + the Collection extension for MediaWiki + @type mwcollection: unicode + """ + + titleRe = '^==\s+(?P<title>.*?)\s+==$' + subtitleRe = '^===\s+(?P<subtitle>.*?)\s+===$' + chapterRe = '^;(?P<chapter>.*?)$' + articleRe = '^:\[\[:?(?P<article>.*?)(?:\|(?P<displaytitle>.*?))?\]\]$' + alltogetherRe = re.compile("(%s)|(%s)|(%s)|(%s)" % (titleRe, subtitleRe, chapterRe, articleRe)) + gotChapter = False + chapter = '' + articles = [] + for line in mwcollection.splitlines(): + res = alltogetherRe.search(line.strip()) + if not res: + continue + if res.group('title'): + self.title = res.group('title') + elif res.group('subtitle'): + self.subtitle = res.group('subtitle') + elif res.group('chapter'): + self.addArticles(articles, chapter) + articles = [] + chapter = res.group('chapter') + elif res.group('article'): + d = {'title': res.group('article')} + if res.group('displaytitle'): + d['displaytitle'] = res.group('displaytitle') + articles.append(d) + + if len(articles): + self.addArticles(articles, chapter) + + def getArticles(self): + """Generator that produces a sequence of (title, revision) pairs for + each 
article contained in this collection. If no revision is specified, + None is returned for the revision item. + """ + + for item in self.getItems(): + if item['type'] == 'article': + yield item['title'], item.get('revision', None) + + def getItems(self): + """Generator that produces a flattened list of chapters and articles + in this collection. + """ + + for item in self.items: + if item['type'] == 'article': + yield item + elif item['type'] == 'chapter': + yield item + for article in item.get('items', []): + yield article + diff --git a/mwlib/mwapidb.py b/mwlib/mwapidb.py new file mode 100644 index 0000000..4826ef4 --- /dev/null +++ b/mwlib/mwapidb.py @@ -0,0 +1,376 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright (c) 2008, PediaPress GmbH +# See README.txt for additional licensing information. + +import os +import re +import shutil +import tempfile +import time +import urllib +import urllib2 +import urlparse + +import simplejson + +from mwlib import uparser, utils +from mwlib.log import Log + +log = Log("mwapidb") + +try: + from mwlib.licenses import lower2normal +except ImportError: + log.warn('no licenses found') + lower2normal = {} + +# ============================================================================== + + +def fetch_url(url, ignore_errors=False): + log.info("fetching %r" % (url,)) + opener = urllib2.build_opener() + opener.addheaders = [('User-agent', 'mwlib')] + try: + data = opener.open(url).read() + except urllib2.URLError, err: + if ignore_errors: + log.error("%s - while fetching %r" % (err, url)) + return None + raise RuntimeError('Could not fetch %r: %s' % (url, err)) + log.info("got %r (%d Bytes)" % (url, len(data))) + return data + + +# ============================================================================== + + +class APIHelper(object): + def __init__(self, base_url): + """ + @param base_url: base URL (or list of URLs) of a MediaWiki, + i.e. URL path to php scripts, + e.g. 
'http://en.wikipedia.org/w/' for English Wikipedia. + @type base_url: basestring or [basestring] + """ + + if isinstance(base_url, unicode): + self.base_url = base_url.encode('utf-8') + else: + self.base_url = base_url + if self.base_url[-1] != '/': + self.base_url += '/' + + def query(self, **kwargs): + args = { + 'action': 'query', + 'format': 'json', + } + args.update(**kwargs) + for k, v in args.items(): + if isinstance(v, unicode): + args[k] = v.encode('utf-8') + data = fetch_url('%sapi.php?%s' % (self.base_url, urllib.urlencode(args))) + if data is None: + return None + try: + return simplejson.loads(unicode(data, 'utf-8'))['query'] + except KeyError: + return None + except: + raise RuntimeError('api.php query failed. Are you sure you specified the correct baseurl?') + + def page_query(self, **kwargs): + q = self.query(**kwargs) + if q is None: + return None + try: + page = q['pages'].values()[0] + except (KeyError, IndexError): + return None + if 'missing' in page: + return None + return page + + +# ============================================================================== + + +class ImageDB(object): + def __init__(self, base_url, shared_base_url=None): + self.api_helpers = [APIHelper(base_url)] + if shared_base_url is not None: + self.api_helpers.append(APIHelper(shared_base_url)) + self.tmpdir = tempfile.mkdtemp() + + def clear(self): + shutil.rmtree(self.tmpdir, ignore_errors=True) + + def getURL(self, name, size=None): + """Return image URL for image with given name + + @param name: image name (without namespace, i.e. 
without 'Image:') + @type name: unicode + + @returns: URL to original image + @rtype: str + """ + + assert isinstance(name, unicode), 'name must be of type unicode' + + for api_helper in self.api_helpers: + if size is None: + result = api_helper.page_query(titles='Image:%s' % name, prop='imageinfo', iiprop='url') + else: + result = api_helper.page_query(titles='Image:%s' % name, prop='imageinfo', iiprop='url', iiurlwidth=str(size)) + if result is not None: + break + else: + return None + + try: + imageinfo = result['imageinfo'][0] + if size is not None and 'thumburl' in imageinfo: + url = imageinfo['thumburl'] + else: + url = imageinfo['url'] + if url: # url can be False + if url.startswith('/'): + url = urlparse.urljoin(self.api_helpers[0].base_url, url) + return url + return None + except (KeyError, IndexError): + return None + + def getDiskPath(self, name, size=None): + """Return filename for image with given name and size + + @param name: image name (without namespace, i.e. without 'Image:') + @type name: unicode + + @param size: if given, the image is converted to the given maximum width + @type size: int or NoneType + + @returns: filename of image or None if image could not be found + @rtype: basestring + """ + + assert isinstance(name, unicode), 'name must be of type unicode' + + url = self.getURL(name, size=size) + if url is None: + return None + + data = fetch_url(url, ignore_errors=True) + if not data: + return None + + ext = url.rsplit('.')[-1] + if size is not None: + ext = '%dpx.%s' % (size, ext) + else: + ext = '.%s' % ext + filename = os.path.join(self.tmpdir, utils.fsescape(name + ext)) + f = open(filename, 'wb') + f.write(data) + f.close() + return filename + + def getLicense(self, name): + """Return license of image as stated on image description page + + @param name: image name without namespace (e.g. 
without "Image:") + @type name: unicode + + @returns: license of image of None, if no valid license could be found + @rtype: unicode + """ + + assert isinstance(name, unicode), 'name must be of type unicode' + + for api_helper in self.api_helpers: + result = api_helper.page_query(titles='Image:%s' % name, prop='templates') + if result is not None: + break + else: + return None + + try: + templates = [t['title'] for t in result['templates']] + except KeyError: + return None + + for t in templates: + try: + return lower2normal[t.split(':', 1)[-1].lower()] + except KeyError: + pass + + return None + + +# ============================================================================== + + +class WikiDB(object): + print_template = u'Template:Print%s' + + ip_rex = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$') + bot_rex = re.compile(r'\bbot\b', re.IGNORECASE) + + def __init__(self, base_url, license, template_blacklist=None): + """ + @param base_url: base URL of a MediaWiki, + e.g. 'http://en.wikipedia.org/w/' + @type base_url: basestring + + @param license: title of an article containing full license text + @type license: unicode + + @param template_blacklist: title of an article containing blacklisted + templates (optional) + @type template_blacklist: unicode + """ + + self.base_url = base_url + self.license = license + self.api_helper = APIHelper(self.base_url) + self.template_cache = {} + self.template_blacklist = [] + if template_blacklist is not None: + raw = self.getRawArticle(template_blacklist) + if raw is None: + log.error('Could not get template blacklist article %r' % template_blacklist) + else: + self.template_blacklist = [template.lower().strip() + for template in re.findall('\* *\[\[.*?:(.*?)\]\]', raw)] + + def getURL(self, title, revision=None): + name = urllib.quote(title.replace(" ", "_").encode('utf-8')) + if revision is None: + return '%sindex.php?title=%s' % (self.base_url, name) + else: + return '%sindex.php?title=%s&oldid=%s' % (self.base_url, 
name, revision) + + def getAuthors(self, title, revision=None, max_num_authors=10): + """Return at most max_num_authors names of non-bot, non-anon users for + non-minor changes of given article (before given revsion). + + @returns: list of principal authors + @rtype: [unicode] + """ + + result = self.api_helper.page_query( + titles=title, + redirects=1, + prop='revisions', + rvprop='user|ids|flags|comment', + rvlimit=500, + ) + if result is None: + return None + + try: + revs = result['revisions'] + except KeyError: + return None + + if revision is not None: + revision = int(revision) + revs = [r for r in revs if r['revid'] < revision] + + authors = [r['user'] for r in revs + if not r.get('anon') + and not self.ip_rex.match(r['user']) + and not r.get('minor') + and not self.bot_rex.search(r.get('comment', '')) + and not self.bot_rex.search(r['user']) + ] + author2count = {} + for a in authors: + try: + author2count[a] += 1 + except KeyError: + author2count[a] = 1 + author2count = author2count.items() + author2count.sort(key=lambda a: -a[1]) + return [a[0] for a in author2count[:max_num_authors]] + + def getTemplate(self, name, followRedirects=True): + """ + Note: *Not* following redirects is unsupported! 
+ """ + + try: + return self.template_cache[name] + except KeyError: + pass + + if ":" in name: + name = name.split(':', 1)[1] + + if name.lower() in self.template_blacklist: + log.info("ignoring blacklisted template:" , repr(name)) + return None + + for title in (self.print_template % name, 'Template:%s' % name): + log.info("Trying template %r" % (title,)) + c = self.getRawArticle(title) + if c is not None: + self.template_cache[name] = c + return c + + return None + + def getRawArticle(self, title, revision=None): + if revision is None: + page = self.api_helper.page_query(titles=title, redirects=1, prop='revisions', rvprop='content') + else: + page = self.api_helper.page_query(revids=revision, prop='revisions', rvprop='content') + if page['title'] != title: # given revision could point to another article! + return None + if page is None: + return None + try: + return page['revisions'][0].values()[0] + except KeyError: + return None + + def getMetaData(self): + result = self.api_helper.query(meta='siteinfo') + try: + g = result['general'] + return { + 'license': { + 'name': g['rights'], + 'wikitext': self.getRawArticle(self.license), + }, + 'url': g['base'], + 'name': '%s (%s)' % (g['sitename'], g['lang']), + } + except KeyError: + return None + + def getParsedArticle(self, title, revision=None): + raw = self.getRawArticle(title, revision=revision) + if raw is None: + return None + a = uparser.parseString(title=title, raw=raw, wikidb=self) + return a + + +class Overlay(WikiDB): + def __init__(self, wikidb, templates): + self.__dict__.update(wikidb.__dict__) + self.overlay_templates = templates + + def getTemplate(self, name, followRedirects=False): + try: + return self.overlay_templates[name] + except KeyError: + pass + + return super(Overlay, self).getTemplate(name, followRedirects=followRedirects) + diff --git a/mwlib/mwscan.py b/mwlib/mwscan.py new file mode 100755 index 0000000..100ea35 --- /dev/null +++ b/mwlib/mwscan.py @@ -0,0 +1,315 @@ +#! 
/usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import sys +import time +import _mwscan +import htmlentitydefs + +class token(object): + t_end = 0 + t_text = 1 + t_entity = 2 + t_special = 3 + t_magicword = 4 + t_comment = 5 + t_2box_open = 6 + t_2box_close = 7 + t_http_url = 8 + t_break = 9 + t_begin_table = 10 + t_end_table = 11 + t_html_tag = 12 + t_style = 13 + t_pre = 14 + t_section = 15 + t_section_end = 16 + t_item = 17 + t_colon = 18 + t_semicolon = 19 + t_hrule = 20 + t_newline = 21 + t_column = 22 + t_row = 23 + t_tablecaption = 24 + t_urllink = 25 + + token2name = {} + +for d in dir(token): + token2name = token.token2name + if d.startswith("t_"): + token2name[getattr(token, d)] = d +del d + + + + +def dump_tokens(text, tokens): + for type, start, len in tokens: + print type, repr(text[start:start+len]) + +def scan(text): + stime=time.time() + text += u"\0"*32 + tokens = _mwscan.scan(text) + return scan_result(text, tokens) + +def resolve_entity(e): + if e[1]=='#': + if e[2]=='x' or e[2]=='X': + return unichr(int(e[3:-1], 16)) + else: + return unichr(int(e[2:-1])) + + else: + try: + return unichr(htmlentitydefs.name2codepoint[e[1:-1]]) + except KeyError: + return e + + +class scan_result(object): + def __init__(self, source, toks): + self.source = source + self.toks = toks + + def rawtext(self, (type, start, tlen)): + return self.source[start:start+tlen] + + def text(self, t): + r=self.rawtext(t) + if t[0] == token.t_entity: + return resolve_entity(r) + else: + return r + + def dump(self, out=None): + if out is None: + out = sys.stdout + for x in self: + out.write("%s\n" % self.repr(x)) + + + + def repr(self, t): + return "(%s, %r)" % (token.token2name.get(t[0]), self.rawtext(t)) + + + def __len__(self): + return len(self.toks) + + def __iter__(self): + return iter(self.toks) + + def __getitem__(self, idx): + return self.toks[idx] + + +class _compat_scanner(object): + class 
ignore: pass + tok2compat = { + token.t_text: "TEXT", + token.t_special: "SPECIAL", + token.t_2box_open: "[[", + token.t_2box_close: "]]", + token.t_http_url: "URL", + token.t_break: "BREAK", + token.t_style: "STYLE", + token.t_pre: "PRE", + token.t_section: "SECTION", + token.t_section_end: "ENDSECTION", + token.t_magicword: ignore, + token.t_comment: ignore, + token.t_end: ignore, + token.t_item: "ITEM", + token.t_colon: "EOLSTYLE", + token.t_semicolon: "EOLSTYLE", + token.t_newline: "\n", + token.t_begin_table: "BEGINTABLE", + token.t_end_table: "ENDTABLE", + token.t_column: "COLUMN", + token.t_row: "ROW", + token.t_tablecaption: "TABLECAPTION", + token.t_urllink: "URLLINK", + } + + + def __call__(self, text): + tokens = scan(text) + scanres = scan_result(text, tokens) + + + res = [] + + def g(): + return text[start:start+tlen] + a = lambda x: res.append((x,g())) + + + ignore = self.ignore + tok2compat = self.tok2compat + + i = 0 + numtokens = len(tokens) + while i < numtokens: + type, start, tlen = tokens[i] + n=tok2compat.get(type) + if n is ignore: + pass + elif n is not None: + a(n) + elif type==token.t_entity: + res.append(("TEXT", resolve_entity(g()))) + elif type==token.t_hrule: + res.append((self.tagtoken("<hr />"), g())) + elif type==token.t_html_tag: + s = g() + + tt = self.tagtoken(s) + isEndToken = isinstance(tt, EndTagToken) + closingOrSelfClosing = isEndToken or tt.selfClosing + + if tt.t=="math": + if closingOrSelfClosing: + i+=1 + continue + + res.append(("MATH", g())) + i+=1 + while i<numtokens: + type, start, tlen = tokens[i] + if type==token.t_html_tag: + tt = self.tagtoken(g()) + if tt.t=="math": + res.append(("ENDMATH", g())) + break + res.append(("LATEX", g())) + i+=1 + elif tt.t=="timeline": + if closingOrSelfClosing: + i+=1 + continue + res.append(("TIMELINE", g())) + i+=1 + while i<numtokens: + type, start, tlen = tokens[i] + if type==token.t_html_tag: + tt = self.tagtoken(g()) + if tt.t=="timeline": + res.append(("TIMELINE", g())) + 
break + res.append(("TEXT", g())) + i+=1 + elif tt.t=="nowiki": + i+=1 + if isEndToken or tt.selfClosing: + continue + while i<numtokens: + type, start, tlen = tokens[i] + if type==token.t_html_tag: + tt = self.tagtoken(g()) + if tt.t=="nowiki": + break + res.append(("TEXT", scanres.text((type, start, tlen)))) + i+=1 + elif tt.t in ["font", "noinclude", 'p', 'caption']: + pass + elif tt.t=="table": + if isEndToken: + res.append(("ENDTABLE", g())) + else: + res.append(("BEGINTABLE", g())) + elif tt.t in ['th', 'td']: + if isEndToken: + pass + else: + res.append(("COLUMN", g())) + elif tt.t=='tr': + if isEndToken: + pass + else: + res.append(("ROW", g())) + else: + res.append((tt, s)) + else: + a(type) + i+=1 + + + return res + + def tagtoken(self, text): + selfClosing = False + if text.startswith(u"</"): + name = text[2:-1] + klass = EndTagToken + isEndToken = True + elif text.endswith("/>"): + name = text[1:-2] + klass = TagToken + selfClosing = True + isEndToken = False # ??? + else: + name = text[1:-1] + klass = TagToken + isEndToken = False + + name, values = (name.split(None, 1)+[u''])[:2] + from mwlib.parser import paramrx + values = dict(paramrx.findall(values)) + name = name.lower() + + if name=='br' or name=='references': + isEndToken = False + klass = TagToken + + r = klass(name, text) + r.selfClosing = selfClosing + r.values = values + return r + + + +compat_scan = _compat_scanner() + +# from plexscanner import _BaseTagToken, TagToken, EndTagToken + +class _BaseTagToken(object): + def __eq__(self, other): + if isinstance(other, basestring): + return self.t == other + if isinstance(other, self.__class__): + return self.t == other.t + return False + + def __ne__(self, other): + return not(self==other) + + def __hash__(self): + return hash(self.t) + +class TagToken(_BaseTagToken): + values = {} + selfClosing=False + + def __init__(self, t, text=''): + self.t = t + self.text = text + + def __repr__(self): + return "<Tag:%s %r>" % (self.t, self.text) + +class 
EndTagToken(_BaseTagToken): + def __init__(self, t, text=''): + self.t = t + self.text = text + + def __repr__(self): + return "<EndTag:%s>" % self.t + +def tokenize(input, name="unknown"): + assert input is not None, "must specify input argument in tokenize" + return compat_scan(input) diff --git a/mwlib/netdb.py b/mwlib/netdb.py new file mode 100755 index 0000000..ec38ee0 --- /dev/null +++ b/mwlib/netdb.py @@ -0,0 +1,529 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + +# Copyright (c) 2008, PediaPress GmbH +# See README.txt for additional licensing information. + +# An alternative solution to construct the hashpath of images would be to use +# api.php, e.g. +# fetch the page http://de.wikipedia.org/w/api.php?action=query&titles=Bild:SomePic.jpg&prop=imageinfo&iiprop=url&format=json + +import os +import sys +import urllib +import urllib2 +try: + from hashlib import md5 +except ImportError: + from md5 import md5 +import shutil +import sys +import time +import tempfile +import re + +from mwlib import uparser, utils +from mwlib.log import Log + +log = Log("netdb") + +# ============================================================================== + +def hashpath(name): + """Compute hashpath for an image in the same way as MediaWiki does + + @param name: name of an image + @type name: unicode + + @returns: hashpath to image + @type: str + """ + + name = name.replace(' ', '_') + name = name[:1].upper()+name[1:] + d = md5(name.encode('utf-8')).hexdigest() + return "/".join([d[0], d[:2], name]) + +class ImageDB(object): + convert_command = 'convert' # name of/path to ImageMagick's convert tool + + def __init__(self, baseurl, cachedir=None, wikidb=None, knownLicenses=None): + """Init ImageDB with a base URL (or a list of base URLs) and optionally + with a cache directory. 
+ + @param baseurl: base URL or sequence containing several base URLs + @type baseurl: unicode or (unicode,) + + @param cachedir: image cache directory (optional) + @type cachedir: basestring or None + + @param wikidb: WikiDB instance used to fetch image description pages to + find out image licenses + @type wikidb: object + + @param knownLicenses: list of known license templates (whose name is the + name of the license) which may appear on image description pages + @type knownLicenses: [unicode] + """ + + if isinstance(baseurl, unicode): + self.baseurls = [baseurl.encode('ascii')] + else: + self.baseurls = [] + for bu in baseurl: + if isinstance(bu, unicode): + bu = bu.encode('ascii') + self.baseurls.append(bu) + + if cachedir: + self.cachedir = cachedir + self.tempcache = False + else: + self.cachedir = tempfile.mkdtemp() + self.tempcache = True + if self.cachedir[-1] != '/': + self.cachedir += '/' # needed for getPath() to work correctly + + self.wikidb = wikidb + + oredLicenses = '|'.join(['(%s)' % re.escape(license) + for license in (knownLicenses or [])]) + self.licenseRegexp = re.compile(r'{{(?P<license>%s)}}' % oredLicenses) + + self.name2license = {} + + def clear(self): + """Delete temporary cache directory (i.e. only if no cachedir has been + passed to __init__(). + """ + + if self.tempcache: + shutil.rmtree(self.cachedir) + + def getURL(self, name, size=None): + """Return image URL for image with given name + + @param name: image name (without namespace, i.e. without 'Image:') + @type name: unicode + + @returns: URL to original image + @rtype: str + """ + + assert isinstance(name, unicode), 'name must be of type unicode' + + # use getDiskPath() to fetch and cache (!) 
image + path = self.getDiskPath(name, size=size) + if path is None: + return None + + # first, look for a cached image with that name (in any size) + for baseurl in self.baseurls: + urldir = self._getCacheDirForBaseURL(baseurl) + if not path.startswith(urldir): + continue + return self._getImageURLForBaseURL(baseurl, name) + + def getPath(self, name, size=None): + """Return path to image with given parameters relative to cachedir""" + + path = self.getDiskPath(name, size=size) + if path is None: + return None + assert path.startswith(self.cachedir), 'invalid path from getDiskPath()' + return path[len(self.cachedir):] + + def getDiskPath(self, name, size=None): + """Return filename for image with given name. If the image is not found + in the cache, it is fetched per HTTP and converted. + + @param name: image name (without namespace, i.e. without 'Image:') + @type name: unicode + + @param size: if given, the image is converted to the given maximum size + (i.e. the image is scaled so that neither its width nor its height + exceed size) + @type size: int or NoneType + + @returns: filename of image + @rtype: basestring + """ + + assert isinstance(name, unicode), 'name must be of type unicode' + + path = self._getImageFromCache(name, size=size) + if path: + return path + + tmpfile, baseurl = self._fetchImage(name) + if tmpfile is None: + return None + + self.name2license[name] = self._fetchLicense(baseurl, name) + + path = self._convertToCache(tmpfile, baseurl, name, size=size) + + try: + os.unlink(tmpfile) + except IOError: + log.warn('Could not delete temp file %r' % tmpfile) + + return path + + def _fetchLicense(self, baseurl, name): + if self.wikidb is None: + return None + + raw = self.wikidb.getImageDescription(name, + urlIndex=self.baseurls.index(baseurl), + ) + if raw is None: + return None + + mo = re.search(self.licenseRegexp, raw) + if mo is None: + return None + + return mo.group('license') + + def getLicense(self, name): + """Return license of image as 
stated on image description page + + @param name: image name without namespace (e.g. without "Image:") + @type name: unicode + + @returns: license of image of None, if no valid license could be found + @rtype: str + """ + + assert isinstance(name, unicode), 'name must be of type unicode' + return self.name2license.get(name) + + def _getImageFromCache(self, name, size=None): + """Look in cachedir for an image with the given parameters""" + + for baseurl in self.baseurls: + path = self._getCachedImagePath(baseurl, name, size=size) + if path is not None and os.path.exists(path): + return path + return None + + def _getCacheDirForBaseURL(self, baseurl): + """Construct the path of the cache directory for the given base URL. + This directory doesn't need to exist. + """ + + return os.path.join(self.cachedir, + md5(baseurl.encode('utf-8')).hexdigest()[:8]) + + def _getCachedImagePath(self, baseurl, name, size=None, makedirs=False): + """Construct a filename for an image with the given parameters inside + the cache directory. The file doesn't need to exist. If makedirs is True + create all directories up to filename. + """ + + urlpart = self._getCacheDirForBaseURL(baseurl) + if size is not None: + sizepart = '%dpx' % size + else: + sizepart = 'orig' + + if name.lower().endswith('.svg'): + if size is None: + log.warn('Cannot get SVG image when no size is given') + return None + name += '.png' + if name.lower().endswith('.gif'): + name += '.png' + name = (name[0].upper() + name[1:]).replace(' ', '_').replace("'", "-") + + d = os.path.join(urlpart, sizepart) + if makedirs and not os.path.isdir(d): + os.makedirs(d) + return os.path.join(d, utils.fsescape(name)) + + def _fetchImage(self, name): + """Fetch image with given name in original (i.e. biggest) size per HTTP. 
+ + @returns: filename of written image and base URL used to retrieve the + image or (None, None) if the image could not be fetched + @rtype: (basestring, str) or (NoneType, NoneType) + """ + + for baseurl in self.baseurls: + path = self._fetchImageFromBaseURL(baseurl, name) + if path: + return path, baseurl + return None, None + + def _getImageURLForBaseURL(self, baseurl, name): + """Construct a URL for the image with given name under given base URL""" + + hp = hashpath(name).encode('utf-8') + return urllib.basejoin(baseurl, urllib.quote(hp)) + + def _fetchImageFromBaseURL(self, baseurl, name): + """Fetch image with given name under given baseurl and write it to a + tempfile. + + @returns: filename of written image or None if image could not be fetched + @rtype: basestring or NoneType + """ + + url = self._getImageURLForBaseURL(baseurl, name) + log.info("fetching %r" % (url,)) + opener = urllib2.build_opener() + opener.addheaders = [('User-agent', 'mwlib')] + try: + data = opener.open(url).read() + log.info("got image: %r" % url) + fd, filename = tempfile.mkstemp() + os.write(fd, data) + os.close(fd) + return filename + except urllib2.URLError, err: + log.error("%s - while fetching %r" % (err, url)) + return None + + def _convertToCache(self, srcfile, baseurl, name, size=None): + """Convert image in file named srcfile to have the given maximum size. + Save the converted image in the cache directory for the given baseurl. 
+ + @returns: filename of converted image + @rtype: basestring + """ + destfile = self._getCachedImagePath(baseurl, name, size=size, makedirs=True) + if size is not None: + thumbnail = '-thumbnail "%dx%d>"' % (size, size) + else: + thumbnail = '-strip' + + opts = '-background white -density 100 -flatten -coalesce %(thumbnail)s' % { + 'thumbnail': thumbnail, + } + cmd = "%(convert)s %(opts)s '%(src)s[0]' '%(dest)s'" % { + 'convert': self.convert_command, + 'opts': opts, + 'src': srcfile, + 'dest': destfile, + } + log.info('executing %r' % cmd) + rc = utils.shell_exec(cmd) + if rc != 0: + log.error('Could not convert %r: convert returned %d' % (name, rc)) + return None + + return destfile + + +# ============================================================================== + +def normname(name): + name = name.strip().replace("_", " ") + name = name[:1].upper()+name[1:] + return name + + +class NetDB(object): + redirect_rex = re.compile(r'^#Redirect:?\s*?\[\[(?P<redirect>.*?)\]\]', re.IGNORECASE) + + def __init__(self, pagename, + imagedescriptionurls=None, + templateurls=None, + templateblacklist=None, + defaultauthors=None, + ): + """ + @param pagename: URL to page in wikitext format. @TITLE@Â gets replaced + with the page name and @REVISION@Â gets replaced with the requested + revision/oldid. E.g. + + "http://mw/index.php?title=@TITLE@&action=raw&oldid=@TITLE@" + + @type pagename: str + + @param imagedescriptionurls: list of URLs to image description pages in + wikitext format. @TITLE@ gets replaced with the image title w/out + its prefix. E.g. + + ["http://mw/index.php?title=Image:@TITLE@s&action=raw"] + + The list must be of the same length as the baseurl list of the + accompanying ImageDB, and the URL with the corresponding position + in the list is used to retrieve the description page. + @type imagedescriptionurls: [str] + + @param templateurls: list of URLs to template pages in wikitext format. + @TITLE@ gets replaced with the template title. E.g. 
+ + ["http://mw/index.php?title=Template:@TITLE@s&action=raw"] + + If more than one URL is specified, URLs are tried in given order. + @type templateurls: [str] + + @param defaultauthors: list of default (principal) authors for articles + @type defaultauthors: [unicode] + """ + + self.pagename = pagename.replace("%", "%%").replace("@TITLE@", "%(NAME)s").replace("@REVISION@", "%(REVISION)s") + + if templateurls is None: + templateurls = [] + self.templateurls = [x.replace("%", "%%").replace("@TITLE@", "%(NAME)s") + for x in templateurls] + + if imagedescriptionurls is None: + imagedescriptionurls = [] + self.imagedescriptionurls = [x.replace("%", "%%").replace("@TITLE@", "%(NAME)s") + for x in imagedescriptionurls] + + if templateblacklist: + self.templateblacklist = self._readTemplateBlacklist(templateblacklist) + else: + self.templateblacklist = [] + + if defaultauthors: + self.defaultauthors = defaultauthors + else: + self.defaultauthors = [] + + self.pages = {} + + def _getpage(self, url, expectedContentType='text/x-wiki'): + try: + return self.pages[url] + except KeyError: + pass + + stime=time.time() + response = urllib.urlopen(url) + data = response.read() + log.info('fetched %r in %ss' % (url, time.time()-stime)) + + if expectedContentType: + ct = response.info().gettype() + if ct != expectedContentType: + log.warn('Skipping page %r with content-type %r (%r was expected). Skipping.'\ + % (url, ct, expectedContentType)) + return None + + self.pages[url] = data + return data + + def _readTemplateBlacklist(self,templateblacklist): + if not templateblacklist: + return [] + try: + content = urllib.urlopen(templateblacklist).read() + return [template.lower().strip() for template in re.findall('\* *\[\[.*?:(.*?)\]\]', content)] + except: # fixme: more sensible error handling... 
+ log.error('Error fetching template blacklist from url:', templateblacklist) + return [] + + def _dummy(self, *args, **kwargs): + pass + + startCache = _dummy + + def getURL(self, title, revision=None): + name = urllib.quote(title.replace(" ", "_").encode('utf8')) + if revision is None: + return self.pagename % dict(NAME=name, REVISION='0') + else: + return self.pagename % dict(NAME=name, REVISION=revision) + + def getAuthors(self, title, revision=None): + return list(self.defaultauthors) + + def title2db(self, title): + assert isinstance(title, unicode), 'title must be of type unicode' + return title.encode('utf-8') + + def db2title(self, dbtitle): + assert isinstance(dbtitle, str), 'dbtitle must be of type str' + return unicode(dbtitle, 'utf-8') + + def getImageDescription(self, title, urlIndex=0): + """Fetch the image description page for the image with the given title. + If baseurl and self.imagedescriptions contains more than one URL, use + the one which starts with baseurl. + + @param title: title of the image w/out prefix (like Image:) + @type title: unicode + + @param urlIndex: index for imagedescriptionurls + @type urlIndex: int + + @returns: wikitext of image description page or None + @rtype: unicode or None + """ + + if not self.imagedescriptionurls: + return None + + raw = self._getpage(self.imagedescriptionurls[urlIndex] % { + 'NAME': urllib.quote(title.replace(" ", "_").encode('utf8')), + }) + if raw is None: + return None + + return unicode(raw, 'utf-8') + + def getTemplate(self, name, followRedirects=False): + if ":" in name: + name = name.split(':', 1)[1] + + + if name.lower() in self.templateblacklist: + log.info("ignoring blacklisted template:" , repr(name)) + return None + name = urllib.quote(name.replace(" ", "_").encode('utf8')) + for u in self.templateurls: + url = u % dict(NAME=name) + log.info("Trying %r" %(url,)) + c=self._getpage(url) + if c: + log.info("got content from", url) + res=unicode(c, 'utf8') + mo = 
self.redirect_rex.search(res) + if mo: + redirect = mo.group('redirect') + redirect = normname(redirect.split("|", 1)[0].split("#", 1)[0]) + return self.getTemplate(redirect) + return res + + + + + #return self.getRawArticle(u'Template:%s' % name) + + def getRawArticle(self, title, revision=None): + r = self._getpage(self.getURL(title, revision=revision)) + if r is None: + return None + return unicode(r, 'utf8') + + def getRedirect(self, title): + return u"" + + def getParsedArticle(self, title, revision=None): + raw = self.getRawArticle(title, revision=revision) + if raw is None: + return None + a = uparser.parseString(title=title, raw=raw, wikidb=self) + return a + + +class Overlay(NetDB): + def __init__(self, wikidb, templates): + self.__dict__.update(wikidb.__dict__) + self.overlay_templates = templates + + def getTemplate(self, name, followRedirects=False): + try: + return self.overlay_templates[name] + except KeyError: + pass + + return super(Overlay, self).getTemplate(name, followRedirects=followRedirects) + diff --git a/mwlib/overlay.py b/mwlib/overlay.py new file mode 100644 index 0000000..68c4774 --- /dev/null +++ b/mwlib/overlay.py @@ -0,0 +1,22 @@ + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import os + +class OverlayDB(object): + def __init__(self, db, basedir): + self.db = db + self.basedir = basedir + + def getRawArticle(self, title): + p = os.path.join(self.basedir, title) + if os.path.isfile(p): + return unicode(open(p, 'rb').read(), 'utf-8') + return self.db.getRawArticle(title) + + def getTemplate(self, title, followRedirects=False): + p = os.path.join(self.basedir, title) + if os.path.isfile(p): + return unicode(open(p, 'rb').read(), 'utf-8') + return self.db.getTemplate(title, followRedirects=followRedirects) diff --git a/mwlib/parser.py b/mwlib/parser.py new file mode 100755 index 0000000..08869c5 --- /dev/null +++ b/mwlib/parser.py @@ -0,0 +1,1416 @@ +#! 
/usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import sys +import os +import re + +from mwlib.scanner import tokenize, TagToken, EndTagToken +from mwlib.log import Log + +log = Log("parser") + + +tag_li = TagToken("li") +tag_div = TagToken("div") + +class TokenSet(object): + def __init__(self, lst): + self.types = set() + self.values = set() + + for x in lst: + if isinstance(x, type): + self.types.add(x) + else: + self.values.add(x) + + def __contains__(self, x): + return x in self.values or type(x) in self.types + +FirstAtom = TokenSet(['TEXT', 'URL', 'SPECIAL', '[[', 'MATH', '\n', + 'BEGINTABLE', 'STYLE', 'TIMELINE', 'ITEM', 'URLLINK', + TagToken]) + +FirstParagraph = TokenSet(['SPECIAL', 'URL', 'TEXT', 'TIMELINE', '[[', 'STYLE', 'BEGINTABLE', 'ITEM', + 'PRE', 'MATH', '\n', 'PRE', 'EOLSTYLE', 'URLLINK', + TagToken]) + + +def show(out, node, indent=0): + print >>out, " "*indent, node + for x in node: + show(out, x, indent+1) + + +paramrx = re.compile("(?P<name>\w+) *= *(?P<value>(?:(?:\".*?\")|(?:(?:\w|[%:])+)))") +def parseParams(s): + def style2dict(s): + res = {} + for x in s.split(';'): + if ':' in x: + var, value = x.split(':', 1) + var=var.strip() + value = value.strip() + res[var] = value + + return res + + def maybeInt(v): + try: + return int(v) + except: + return v + + r = {} + for name, value in paramrx.findall(s): + if value.startswith('"'): + value = value[1:-1] + + if name=='style': + value = style2dict(value) + r['style'] = value + else: + r[name] = maybeInt(value) + return r + + + + +class Node(object): + caption = '' + + def __init__(self, caption=''): + self.children = [] + self.caption = caption + + def hasContent(self): + for x in self.children: + if x.hasContent(): + return True + return False + + def append(self, c, merge=False): + if c is None: + return + + if merge and type(c)==Text and self.children and type(self.children[-1])==Text: + self.children[-1].caption += 
c.caption + else: + self.children.append(c) + + def __iter__(self): + for x in self.children: + yield x + + def __repr__(self): + return "%s %r: %s children" % (self.__class__.__name__, self.caption, len(self.children)) + + def __eq__(self, other): + return (isinstance(other, self.__class__) + and self.caption == other.caption + and self.children == other.children) + + def __ne__(self, other): + return not(self==other) + + def allchildren(self): # name is broken, returns self, which is not a child + yield self + for c in self.children: + for x in c.allchildren(): + yield x + + def find(self, tp): + """find instances of type tp in self.allchildren()""" + return [x for x in self.allchildren() if isinstance(x, tp)] + + + def filter(self, fun): + for x in self.allchildren(): + if fun(x): + yield x + + def _asText(self, out): + out.write(self.caption) + for x in self.children: + x._asText(out) + + def asText(self, ): + from StringIO import StringIO + out = StringIO() + self._asText(out) + return out.getvalue() + +class Math(Node): pass +class Ref(Node): pass +class Item(Node): pass +class ItemList(Node): + numbered = False + def append(self, node, merge=False): + if not isinstance(node, Item): + c=Item() + c.append(node) + self.children.append(c) + else: + self.children.append(node) + +class Style(Node): pass +class Book(Node): pass +class Magic(Node): pass +class Chapter(Node): pass +class Article(Node): pass +class Paragraph(Node): pass +class Section(Node): pass +class Timeline(Node): pass +class TagNode(Node): pass +class PreFormatted(TagNode): pass +class URL(Node): pass +class NamedURL(Node): pass + + + +class _VListNode(Node): + def __init__(self, caption=''): + Node.__init__(self, caption) + self.vlist = {} + + def __repr__(self): + return "%s %r %s: %s children" % (self.__class__.__name__, self.caption, self.vlist, len(self.children)) + +class Table(_VListNode): + pass + +class Row(_VListNode): + pass + +class Cell(_VListNode): + pass + +class 
Caption(_VListNode): + pass + +class Link(Node): + target = None + + specialPrefixes = set([ + # English + "wikipedia", "wiktionary", "wikibooks", "wikisource", "wikiquote", "meta", "talk", + "commons", "wikinews", "template", "wikitravel", "help", + # German + "vorlage", + # Spanish + ]) + + imageKeywords = set([ + "image", "imagen", "bild" + ]) + + categoryKeywords = set([ + "category", "kategorie" + ]) + + from mwlib.lang import languages + colon = False + + def hasContent(self): + if self.target: + return True + return False + + def _specialize(self): + if not self.children: + return + + if type(self.children[0]) != Text: + return + + self.target = target = self.children[0].caption.strip() + del self.children[0] + if self.children and self.children[0] == Control("|"): + del self.children[0] + + pic = self.target + if pic.startswith(':'): + self.colon = True + + + + # pic == "Bild:Wappen_von_Budenheim.png" + + pic = pic.strip(': ') + if ':' not in pic: + return + + linktype, pic = pic.split(':', 1) + linktype = linktype.lower().strip(" :") + + if linktype in self.categoryKeywords: + self.__class__ = CategoryLink + self.target = pic.strip() + return + + if linktype in self.specialPrefixes: + self.__class__ = SpecialLink + self.target = pic.strip() + self.ns = linktype + + return + + if linktype in self.languages: + self.__class__ = LangLink + return + + + if linktype not in self.imageKeywords: + # assume a LangLink + log.info("Unknown linktype:", repr(linktype)) + if len(linktype)==2: + self.__class__ = LangLink + return + + + # pic == "Wappen_von_Budenheim.png" + + # WTB: See es.wikipedia.org/wiki/Provincia_de_Lima + #try: + # prefix, suffix = pic.rsplit('.', 1) + #except ValueError: + # return + #if suffix.lower() in ['jpg', 'jpeg', 'gif', 'png', 'svg']: + + self.__class__ = ImageLink + self.target = pic.strip() + + + + idx = 0 + last = [] + + while idx<len(self.children): + x = self.children[idx] + if x == Control("|"): + if idx: + last = self.children[:idx] + 
+ del self.children[:idx+1] + idx = 0 + continue + + if not type(x)==Text: + idx += 1 + continue + + x = x.caption.lower() + + if x == 'thumb' or x=='thumbnail': + self.thumb = True + del self.children[idx] + continue + + if x in ['left', 'right', 'center', 'none']: + self.align = x + del self.children[idx] + continue + + if x == 'frame' or x=='framed' or x=='enframed': + self.frame = True + del self.children[idx] + continue + + + if x.endswith('px'): + # x200px + # 100x200px + # 200px + x = x[:-2] + width, height = (x.split('x')+['0'])[:2] + try: + width = int(width) + except ValueError: + width = 0 + + try: + height = int(height) + except ValueError: + height = 0 + + self.width = width + self.height = height + del self.children[idx] + continue + + idx += 1 + + if not self.children: + self.children = last + +class ImageLink(Link): + target = None + width = None + height = None + align = '' + thumb = False + frame = False + + def isInline(self): + return not bool(self.align or self.thumb or self.frame) + +class LangLink(Link): + pass + +class CategoryLink(Link): + pass + +class SpecialLink(Link): + pass + + +class Text(Node): + def __repr__(self): + return repr(self.caption) + + def __init__(self, txt): + self.caption = txt + self.children = [] + + def hasContent(self): + if self.caption.strip(): + return True + return False + +class Control(Text): + pass + +def _parseAtomFromString(s): + from mwlib import scanner + tokens = scanner.tokenize(s) + p=Parser(tokens) + try: + return p.parseAtom() + except Exception, err: + log.error("exception while parsing %r: %r" % (s, err)) + return None + + + +def parse_fields_in_imagemap(imap): + + if imap.image: + imap.imagelink = _parseAtomFromString(u'[['+imap.image+']]') + if not isinstance(imap.imagelink, ImageLink): + imap.imagelink = None + + # FIXME: the links of objects inside 'entries' array should also be parsed + + +def append_br_tag(node): + """append a self-closing 'br' TagNode""" + br = TagNode("br") + br.starttext 
= '<br />' + br.endtext = '' + node.append(br) + +class Parser(object): + def __init__(self, tokens, name=''): + self.tokens = tokens + self.pos = 0 + self.name = name + self.lastpos = 0 + self.count = 0 + + @property + def token(self): + t=self.tokens[self.pos] + if self.pos == self.lastpos: + self.count += 1 + if self.count > 500: + from mwlib.caller import caller + + raise RuntimeError("internal parser error: %s" % ((self.pos, t, caller()), )) + else: + self.count = 0 + self.lastpos = self.pos + + + return t + + + + @property + def left(self): + return self.pos < len(self.tokens) + + def next(self): + self.pos += 1 + + def parseAtom(self): + token = self.token + + if token[0]=='TEXT': + self.next() + return Text(token[1]) + elif token[0]=='URL': + self.next() + return URL(token[1]) + elif token[0]=='URLLINK': + return self.parseUrlLink() + elif token[0]=='SPECIAL': + self.next() + return Text(token[1]) + elif token[0]=='[[': + return self.parseLink() + elif token[0]=='MATH': + return self.parseMath() + elif token[0]=='\n': + self.next() + return Text(token[1]) + elif token[0]=='BEGINTABLE': + return self.parseTable() + elif token[0]=='STYLE': + return self.parseStyle() + elif token[0]=='TIMELINE': + return self.parseTimeline() + elif token[0]=='ITEM': + return self.parseItemList() + elif isinstance(token[0], TagToken): + return self.parseTagToken() + else: + raise RuntimeError("not handled: %s" % (token,)) + + def parseUrlLink(self): + u = self.token[1][1:] + n = Node() + n.append(Text("[")) + n.append(URL(u)) + + self.next() + + while self.left: + if self.tokens[self.pos:self.pos+2] == [(']]', ']]'), ('SPECIAL', u']')]: + self.tokens[self.pos:self.pos+2] = [('SPECIAL', ']'), (']]', ']]')] + + token = self.token + + + if token[0] == 'SPECIAL' and token[1]==']': + self.next() + n.__class__ = NamedURL + n.caption = u + del n.children[:2] + break + elif token[0] in FirstAtom: + n.append(self.parseAtom()) + else: + break + + return n + + + def parseArticle(self): + 
a=Article(self.name) + + while self.left: + token = self.token + if token[0] == 'SECTION': + a.append(self.parseSection()) + elif token[0]=='BREAK': + self.next() + elif token[0] in FirstParagraph: + a.append(self.parseParagraph()) + else: + log.info("in parseArticle: skipping", token) + self.next() + + return a + + def parseLink(self): + break_at = TokenSet(['BREAK', EndTagToken, 'SECTION']) + + obj = Link() + self.next() + while self.left: + token = self.token + if token[0] == ']]': + self.next() + break + elif token[0]=='SPECIAL' and token[1]==']': + self.next() + break + elif token[1] == '|' or token[1]=="||": + obj.append(Control('|')) + self.next() + elif token[0]=='TEXT' or token[0]=='SPECIAL' or token[0]=='\n': + obj.append(Text(token[1]), merge=True) + self.next() + elif token[0] in break_at: + break + elif token[0] in FirstAtom: + obj.append(self.parseAtom()) + elif token[1].startswith("|"): + obj.append(Control("|")) + obj.append(Text(token[1][1:])) + self.next() + else: + log.info("assuming text in parseLink", token) + obj.append(Text(token[1]), merge=True) + self.next() + + obj._specialize() + + return obj + + def parseTag(self): + token = self.token[0] + + n = TagNode(token.t) + if token.values: + n.values = token.values + n.vlist = parseParams(self.token[1]) + + n.starttext = token.text + n.endtext = u'</%s>' % token.t + self.next() + + if token.selfClosing: + return n + + + end = EndTagToken(token.t) + + while self.left: + token = self.token + if token[0]==end: + n.endtext = token[0].text + self.next() + break + elif token[0]=='BREAK': + self.next() + else: + if token[0] not in FirstParagraph: + log.warn("tag not closed", n, token) + break + n.append(self.parseParagraph()) + + return n + + def parsePRETag(self): + token = self.token[0] + if token.t.lower()=='pre': + n=PreFormatted() + else: + n=TagNode(token.t) + + n.vlist = parseParams(self.token[1]) + + end = EndTagToken(self.token[0].t) + self.next() + + txt = [] + while self.left: + token = 
self.token + if token[0]==end: + self.next() + break + txt.append(token[1]) + self.next() + + n.append(Text("".join(txt))) + return n + + parseCODETag = parsePRETag + parseSOURCETag = parsePRETag + def parseA7831D532A30DF0CD772BBC895944EC1Tag(self): + p = self.parseTag() + p.__class__ = Magic + return p + + parseREFTag = parseTag + parseREFERENCESTag = parseTag + + parseDIVTag = parseTag + parseSPANTag = parseTag + parseINDEXTag = parseTag + parseTTTag = parseTag + + parseH1Tag = parseTag + parseH2Tag = parseTag + parseH3Tag = parseTag + parseH4Tag = parseTag + parseH5Tag = parseTag + parseH6Tag = parseTag + + parseINPUTBOXTag = parseTag + + parseRSSTag = parseTag + + parseSTRIKETag = parseTag + parseCODETag = parseTag + parseDELTag = parseTag + parseINSTag = parseTag + parseCENTERTag = parseTag + parseSTARTFEEDTag = parseTag + parseENDFEEDTag = parseTag + parseCENTERTag = parseTag + + def parseGALLERYTag(self): + node = self.parseTag() + txt = "".join(x.caption for x in node.find(Text)) + #print "GALLERY:", repr(txt) + + children=[] + + lines = [x.strip() for x in txt.split("\n")] + for x in lines: + if not x: + continue + + # either image link or text inside + # FIXME: Styles and links in text are ignored! 
+ n=_parseAtomFromString(u'[['+x+']]') + + if isinstance(n, ImageLink): + children.append(n) + else: + children.append(Text(x)) + + node.children=children + + return node + + def parseIMAGEMAPTag(self): + node = self.parseTag() + txt = "".join(x.caption for x in node.find(Text)) + #from mwlib import imgmap + #node.imagemap = imgmap.ImageMapFromString(txt) + + class FakeImageMap(object): + pass + + node.imagemap = FakeImageMap() + node.imagemap.entries = [] + node.imagemap.imagelink = None + match = re.search('Image:.*', txt) + + if match: + node.imagemap.image = match.group(0) + else: + node.imagemap.image = None + + parse_fields_in_imagemap(node.imagemap) + + #print node.imagemap + return node + + def parseSection(self): + s = Section() + + level = self.token[1].count('=') + s.level = level + closelevel = 0 + + self.next() + + title = Node() + while self.left: + token = self.token + + if token[0] == 'ENDSECTION': + closelevel = self.token[1].count('=') + self.next() + break + elif token[0] == '[[': + title.append(self.parseLink()) + elif token[0] == "STYLE": + title.append(self.parseStyle()) + elif token[0] == 'TEXT': + self.next() + title.append(Text(token[1])) + elif isinstance(token[0], TagToken): + title.append(self.parseTagToken()) + elif token[0] == 'URLLINK': + title.append(self.parseUrlLink()) + elif token[0] == 'MATH': + title.append(self.parseMath()) + else: + self.next() + title.append(Text(token[1])) + + s.level = min(level, closelevel) + if s.level==0: + title.children.insert(0, Text("="*level)) + s.__class__ = Node + else: + diff = closelevel-level + if diff>0: + title.append(Text("="*diff)) + elif diff<0: + title.children.insert(0, Text("="*(-diff))) + + s.append(title) + + + while self.left: + token = self.token + if token[0] == 'SECTION': + if token[1].count('=') <= level: + return s + + s.append(self.parseSection()) + elif token[0] in FirstParagraph: + s.append(self.parseParagraph()) + else: + log.info("in parseSection: skipping", token) + break 
+ + return s + + def parseStyle(self): + end = self.token[1] + b = Style(self.token[1]) + self.next() + + break_at = TokenSet(['BREAK', '\n', 'ENDEOLSTYLE', 'SECTION', 'ENDSECTION', + 'BEGINTABLE', ']]', 'ROW', 'COLUMN', 'ENDTABLE', EndTagToken]) + + while self.left: + token = self.token + if token[0]=="STYLE": + if token[1]==end: + self.next() + break + else: + new = token[1] + if end=="'''''": + if token[1]=="''": + new = "'''" + else: + new = "''" + elif end=="''": + if token[1]=="'''": + new = "'''''" + elif token[1]=="'''''": + new = "'''" + elif end=="'''": + if token[1]=="''": + new = "'''''" + elif token[1]=="'''''": + new = "''" + + self.tokens[self.pos] = ("STYLE", new) + break + elif token[0] in break_at: + break + elif token[0] in FirstAtom: + b.append(self.parseAtom()) + else: + log.info("assuming text in parseStyle", token) + b.append(Text(token[1])) + self.next() + + return b + + + def parseColumn(self): + token = self.token + c = Cell() + + params = '' + if "|" in token[1] or "!" 
in token[1]: # not a html cell + # search for the first occurence of "||", "|", "\n" in the next tokens + # if it's a "|" we have a parameter list + self.next() + savepos = self.pos + + while self.left: + token = self.token + self.next() + if token[0] in ("\n", "BREAK", "[[", "ROW", "ENDTABLE"): + params = '' + self.pos = savepos + break + elif (token[0]=='SPECIAL' or token[0]=='COLUMN') and token[1]=='|': + break + params += token[1] + + c.vlist = parseParams(params) + + elif token[0]=='COLUMN': # html cell + params=parseParams(token[1]) + #print "CELLTOKEN:", token + #print "PARAMS:", params + c.vlist = params + self.next() + + + + while self.left: + token = self.token + if token[0] in ("COLUMN", "ENDTABLE", "ROW"): + break + + if token[0] == 'BEGINTABLE': + c.append(self.parseTable()) + elif token[0]=='SPECIAL' and token[1] == '|': + self.next() + elif token[0] == 'SECTION': + c.append(self.parseSection()) + elif token[0] in FirstParagraph: + c.append(self.parseParagraph()) + elif isinstance(token[0], EndTagToken): + log.info("ignoring %r in parseColumn" % (token,)) + self.next() + else: + log.info("assuming text in parseColumn", token) + c.append(Text(token[1])) + self.next() + + return c + + + def parseRow(self): + r = Row() + r.vlist={} + + token = self.token + params = '' + if token[0]=='ROW': + self.next() + if "|-" in token[1]: + # everything till the next newline/break is a parameter list + while self.left: + token = self.token + if token[0]=='\n' or token[0]=='BREAK': + break + else: + params += token[1] + self.next() + r.vlist = parseParams(params) + + else: + # html row + r.vlist = parseParams(token[1]) + + + while self.left: + token = self.token + if token[0] == 'COLUMN': + r.append(self.parseColumn()) + elif token[0] == 'ENDTABLE': + return r + elif token[0] == 'ROW': + return r + elif token[0] == 'BREAK': + self.next() + elif token[0]=='\n': + self.next() + else: + log.warn("skipping in parseRow: %r" % (token,)) + self.next() + return r + + def 
parseCaption(self): + token = self.token + self.next() + n = Caption() + params = "" + if token[1].strip().startswith("|+"): + # search for the first occurence of "||", "|", "\n" in the next tokens + # if it's a "|" we have a parameter list + savepos = self.pos + while self.left: + token = self.token + self.next() + if token[0] in ("\n", "BREAK", "[[", "ROW", "COLUMN", "ENDTABLE"): + params = '' + self.pos = savepos + break + elif token[0]=='SPECIAL' and token[1]=='|': + break + params += token[1] + + n.vlist = parseParams(params) + + while self.left: + token = self.token + if token[0] in ('TEXT' , 'SPECIAL', '\n'): + if token[1]!='|': + n.append(Text(token[1])) + self.next() + elif token[0] == 'STYLE': + n.append(self.parseStyle()) + elif isinstance(token[0], TagToken): + n.append(self.parseTagToken()) + elif token[0] == '[[': + n.append(self.parseLink()) + else: + break + return n + + def parseTable(self): + token = self.token + self.next() + t = Table() + + params = "" + if "{|" in token[1]: # not a <table> tag + # everything till the next newline/break is a parameter list + while self.left: + token = self.token + if token[0]=='\n' or token[0]=='BREAK': + break + else: + params += token[1] + self.next() + t.vlist = parseParams(params) + else: + t.vlist = parseParams(token[1]) + + while self.left: + token = self.token + if token[0]=='ROW' or token[0]=='COLUMN': + t.append(self.parseRow()) + elif token[0]=='TABLECAPTION': + t.append(self.parseCaption()) + elif token[0]=='ENDTABLE': + self.next() + break + elif token[0]=='\n': + self.next() + else: + log.warn("skipping in parseTable", token) + self.next() + #t.append(self.parseRow()) + + return t + + def parseMath(self): + self.next() + caption = u'' + while self.left: + token = self.token + self.next() + if token[0]=='ENDMATH': + break + caption += token[1] + return Math(caption) + + def parseTimeline(self): + t=Timeline() + self.next() + snippets = [] + while self.left: + token = self.token + self.next() + if 
token[0]=='TIMELINE': + break + snippets.append(token[1]) + t.caption = "".join(snippets) + return t + + def parseEOLStyle(self): + token = self.token + maybe_definition = False + if token[1]==';': + p=Style(";") + maybe_definition = True + elif token[1].startswith(':'): + p=Style(token[1]) + else: + p=Style(":") + + assert p + retval = p + + self.next() + + last = None + # search for the newline and replace it with ENDEOLSTYLE + for idx in range(self.pos, len(self.tokens)-1): + if self.tokens[idx][0]=='BREAK' or self.tokens[idx][0]=='\n': + last = idx, self.tokens[idx] + self.tokens[idx] = ("ENDEOLSTYLE", self.tokens[idx][1]) + break + + break_at = TokenSet(['ENDEOLSTYLE', 'BEGINTABLE', 'BREAK', EndTagToken]) + + while self.left: + token = self.token + if token[0] in break_at: + break + elif maybe_definition and token[1]==':': + self.next() + maybe_definition = False + retval = Node() + retval.append(p) + p = Style(":") + retval.append(p) + + elif token[0] in FirstAtom: + p.append(self.parseAtom()) + else: + log.info("in parseEOLStyle: assuming text", token) + p.append(Text(token[1])) + self.next() + + if last: + self.tokens[last[0]] = last[1] + + return retval + + def parseParagraph(self): + p = Node() + + while self.left: + token = self.token + if token[0]=='EOLSTYLE': + p.append(self.parseEOLStyle()) + elif token[0]=='PRE': + pre = self.parsePre() + if pre is None: + # empty line with spaces. 
handle like BREAK + p.__class__ = Paragraph + break + p.append(pre) + elif token[0] == 'BREAK': + self.next() + p.__class__ = Paragraph + break + elif token[0] == 'SECTION': + p.__class__ = Paragraph + break + elif token[0] == 'ENDSECTION': + p.append(Text(token[1])) + self.next() + elif token[0] in FirstAtom: + p.append(self.parseAtom()) + else: + break + + if not self.left: + p.__class__ = Paragraph + + if p.children: + return p + else: + return None + + def parseTagToken(self): + tag = self.token[0].t + try: + m=getattr(self, 'parse'+tag.upper()+'Tag') + except (AttributeError, UnicodeEncodeError): + t=Text(self.token[1]) + self.next() + return t + else: + return m() + + def parseEMTag(self): + return self._parseStyledTag(Style("''")) + + def parseITag(self): + return self._parseStyledTag(Style("''")) + + def parseBTag(self): + return self._parseStyledTag(Style("'''")) + + def parseSTRONGTag(self): + return self._parseStyledTag(Style("'''")) + + def parseBLOCKQUOTETag(self): + return self._parseStyledTag(Style(":")) + + def _parseStyledTag(self, style=None): + + token = self.token[0] + if style is None: + style = Style(token.t) + + b = style + end = EndTagToken(token.t) + start = TagToken(token.t) + self.next() + + + if token.selfClosing: + return style + + break_at = set(["ENDTABLE", "ROW", "COLUMN", "ITEM", "BREAK", "SECTION", "BEGINTABLE"]) + + while self.left: + token = self.token + if token[0] in break_at: + break + elif token[0]=='\n': + b.append(Text(token[1])) + self.next() + elif token[0]==end: + self.next() + break + elif isinstance(token[0], EndTagToken): + break + elif isinstance(token[0], TagToken): + if token[0]==start: + self.next() # 'Nuclear fuel' looks strange otherwise + break + b.append(self.parseTagToken()) + elif token[0] in FirstAtom: + b.append(self.parseAtom()) + else: + log.info("_parseStyledTag: assuming text", token) + b.append(Text(token[1])) + self.next() + + return b + + parseVARTag = parseCITETag = parseSTag = parseSUPTag = 
parseSUBTag = parseBIGTag = parseSMALLTag = _parseStyledTag + + def parseBRTag(self): + token = self.token[0] + n = TagNode(token.t) + n.starttext = token.text + n.endtext = u'' + self.next() + return n + + parseHRTag = parseBRTag + + def parseUTag(self): + token = self.token + if "overline" in self.token[1].lower(): + s = Style("overline") + else: + s = None + + return self._parseStyledTag(s) + + def parsePre(self): + p = n = PreFormatted() + token = self.token + p.append(Text(token[1])) + + self.next() + + # find first '\n' not followed by a 'PRE' token + last = None + for idx in range(self.pos, len(self.tokens)-1): + if self.tokens[idx][0] in ['ROW', 'COLUMN', 'BEGINTABLE', 'ENDTABLE', 'TIMELINE', 'MATH']: + return None + + if self.tokens[idx][0]=='BREAK': + break + + if self.tokens[idx][0]=='\n' and self.tokens[idx+1][0]!='PRE': + last = idx, self.tokens[idx] + self.tokens[idx]=('ENDPRE', '\n') + break + + + while self.left: + token = self.token + if token[0] == 'ENDPRE' or token[0]=='BREAK': + break + if token[0]=='\n' or token[0]=='PRE' or token[0]=='TEXT': + p.append(Text(token[1])) + self.next() + elif token[0] == 'SPECIAL': + p.append(Text(token[1])) + self.next() + elif isinstance(token[0], EndTagToken): + break + elif isinstance(token[0], TagToken): + if token[0] == tag_div: + break + + p.append(self.parseTagToken()) + elif token[0] in FirstAtom: + p.append(self.parseAtom()) + else: + log.info("in parsePre: assuming text", token) + p.append(Text(token[1])) + self.next() + + if last: + self.tokens[last[0]] = last[1] + + for x in p: + if not isinstance(x, Text): + return p + if x.caption.strip(): + return p + + return None + + + + def parseOLTag(self): + numbered = parseParams(self.token[1]).get('type', '1') + return self._parseHTMLList(numbered) + + def parseULTag(self): + return self._parseHTMLList(False) + + def parseLITag(self): + p = item = Item() + self.next() + break_at = TokenSet([EndTagToken, 'ENDTABLE', 'SECTION']) + while self.left: + token = 
self.token + if token[0] == '\n': + p.append(Text(token[1])) + self.next() + elif token[0] == 'EOLSTYLE': + p.append(self.parseEOLStyle()) + elif token[0]=='BREAK': + append_br_tag(p) + self.next() + elif token[0]==tag_li: + break + elif token[0]==EndTagToken("li"): + self.next() + break + elif token[0] in break_at: + break + elif token[0] in FirstAtom: + p.append(self.parseAtom()) + else: + log.info("in parseLITag: assuming text", token) + p.append(Text(token[1])) + self.next() + + return item + + + def _parseHTMLList(self, numbered): + lst = ItemList() + lst.numbered = numbered + + end = EndTagToken(self.token[0].t) + + self.next() + while self.left: + token = self.token + if token[0]==end: + self.next() + break + elif isinstance(token[0], TagToken): + lst.append(self.parseTagToken()) + elif token[0]=='ITEM': + lst.append(self.parseItemList()) + elif token[0] in FirstAtom: + lst.append(self.parseAtom()) + else: + log.info("assuming text in _parseHTMLList", token) + lst.append(Text(token[1])) + self.next() + + return lst + + + def parseItemList(self): + # actually this parses multiple nested item lists.. 
+ items = [] + while self.left: + token = self.token + if token[0]=='ITEM': + items.append(self.parseItem()) + else: + break + + # hack + commonprefix = lambda x,y : os.path.commonprefix([x,y]) + + current_prefix = u'' + stack = [Node()] + + def append_item(parent, node): + if parent is stack[0]: + parent.append(node) + return + + if not parent.children: + parent.children.append(Item()) + + parent.children[-1].append(node) + + for item in items: + prefix = item.prefix.strip(":") + common = commonprefix(current_prefix, item.prefix) + + stack = stack[:len(common)+1] + + create = prefix[len(common):] + for x in create: + itemlist = ItemList() + itemlist.numbered = (x=='#') + append_item(stack[-1], itemlist) + stack.append(itemlist) + stack[-1].append(item) + current_prefix = prefix + + return stack[0] + + def parseItem(self): + p = item = Item() + p.prefix = self.token[1] + + self.token[1] + break_at = TokenSet(["ENDTABLE", "COLUMN", "ROW"]) + + self.next() + while self.left: + token = self.token + + if token[0] == '\n': + self.next() + break + elif token[0]=='BREAK': + break + elif token[0]=='SECTION': + break + elif isinstance(token[0], EndTagToken): + break + elif token[0] in break_at: + break + elif token[0] in FirstAtom: + p.append(self.parseAtom()) + else: + log.info("in parseItem: assuming text", token) + p.append(Text(token[1])) + self.next() + return item + + + def parse(self): + log.info("Parsing", repr(self.name)) + try: + return self.parseArticle() + except Exception, err: + log.error("error while parsing article", repr(self.name), repr(err)) + raise + +def main(): + #import htmlwriter + from mwlib.dummydb import DummyDB + db = DummyDB() + + for x in sys.argv[1:]: + input = unicode(open(x).read(), 'utf8') + from mwlib import expander + te = expander.Expander(input, pagename=x, wikidb=db) + input = te.expandTemplates() + + + tokens = tokenize(input, x) + + p=Parser(tokens, os.path.basename(x)) + r = p.parse() + + show(sys.stdout, r, 0) + + #hw = 
htmlwriter.HTMLWriter(htmlout) + +if __name__=="__main__": + main() diff --git a/mwlib/recorddb.py b/mwlib/recorddb.py new file mode 100755 index 0000000..fe895cf --- /dev/null +++ b/mwlib/recorddb.py @@ -0,0 +1,83 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import simplejson +import zipfile +from mwlib import uparser, parser +import mwlib.log +log = mwlib.log.Log("zip") + + +class RecordDB(object): + def __init__(self, db): + assert db is not None, "db must not be None" + self.db = db + self.articles = {} + self.templates = {} + + def getRawArticle(self, name, revision=None): + r = self.db.getRawArticle(name, revision=revision) + self.articles[name] = { + 'revision': revision, + 'content-type': 'text/x-wiki', + 'content': r, + 'url': self.db.getURL(name, revision=revision), + 'authors': self.db.getAuthors(name, revision=revision), + } + return r + + def getTemplate(self, name, followRedirects=False): + r = self.db.getTemplate(name, followRedirects=followRedirects) + self.templates[name] = { + 'content-type': 'text/x-wiki', + 'content': r, + } + return r + + +class ZipfileCreator(object): + def __init__(self, zf, wikidb=None, imgdb=None): + self.zf = zf + self.db = RecordDB(wikidb) + self.images = {} + self.imgdb = imgdb + + def addObject(self, name, value): + """ + @type name: unicode + + @type value: str + """ + + self.zf.writestr(name.encode('utf-8'), value) + + def addArticle(self, title, revision=None): + a = uparser.parseString(title, revision=revision, wikidb=self.db) + for x in a.allchildren(): + if isinstance(x, parser.ImageLink): + name = x.target + self.images[name] = {} + + def writeImages(self, size=None): + if self.imgdb is None: + return + + for name in sorted(self.images.keys()): + dp = self.imgdb.getDiskPath(name, size=size) + if dp is None: + continue + self.zf.write(dp, (u"images/%s" % name.replace("'", '-')).encode("utf-8")) + self.images[name]['url'] = 
self.imgdb.getURL(name, size=size) + license = self.imgdb.getLicense(name) + if license: + self.images[name]['license'] = license + + def writeContent(self): + self.addObject('content.json', simplejson.dumps(dict( + articles=self.db.articles, + templates=self.db.templates, + images=self.images, + ))) + diff --git a/mwlib/rendermath.py b/mwlib/rendermath.py new file mode 100755 index 0000000..10d6cdd --- /dev/null +++ b/mwlib/rendermath.py @@ -0,0 +1,144 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +import os +import re +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + +from mwlib import texmap +import mwlib.log + +log = mwlib.log.Log("rendermath") + +latex = r""" +%% %(ident)s +\documentclass[%(fontsize)spt]{article} +%(extra_header)s +\usepackage{ucs} +\usepackage{amsmath} +\usepackage{amsfonts} +\usepackage{amssymb} + +%% \newcommand{\R}[0]{\mathbb{R}} + +\def\Alpha{{A{}}} +\def\Beta{{B{}}} +\def\Epsilon{{E{}}} +\def\Zeta{{Z{}}} +\def\Eta{{H{}}} +\def\Iota{{I{}}} +\def\Kappa{{K{}}} +\def\Mu{{M{}}} +\def\Nu{{N{}}} +\def\Rho{{P{}}} +\def\Tau{{T{}}} +\def\Chi{{C{}}} + +\usepackage[utf8x]{inputenc} +\usepackage[dvips]{graphicx} +\pagestyle{empty} +\begin{document} +%(source)s +\end{document} +""" + + + + + + +def mysystem(cmd): + err=os.system(cmd) + if err: + raise RuntimeError("exit code %s while running %r" % (err, cmd)) + +class Renderer(object): + basedir = os.path.expanduser("~/pngmath/") + + def __init__(self, basedir=None, lazy=True): + if basedir: + self.basedir = os.path.realpath(os.path.join(basedir, 'pngmath/')) + if not os.path.exists(self.basedir): + #os.makedirs(self.basedir) + pass + self.lazy = lazy + + def _render_file(self, name, format): + assert format in ('pdf', 'png', 'eps'), "rendermath: format %r not supported" % format + + texfile = os.path.join(self.basedir, name+'.tex') + srcbase = os.path.join(self.basedir, name) + + cwd = 
os.getcwd() + os.chdir(self.basedir) + try: + mysystem("latex -interaction=batchmode %s" % texfile) + mysystem("dvips -E %s.dvi -o %s.ps" % (srcbase, srcbase)) + if format=='png': + mysystem("convert +adjoin -transparent white -density 300x300 %s.ps %s.png" % (srcbase, srcbase)) + elif format=='pdf': + mysystem("epstopdf %s.ps" % srcbase) + elif format=='eps': + os.rename("%s.ps" % srcbase, "%s.eps" % srcbase) + finally: + for x in ['.dvi', '.aux', '.log', '.ps']: + p = os.path.join(self.basedir, name+x) + try: + os.unlink(p) + except OSError, err: + pass + + os.chdir(cwd) + + def _normalizeLatex(self, latexsource): + latexsource = re.compile("\n+").sub("\n", latexsource) + return latexsource + + def convert(self, latexsource, lazy=True, format='pdf', addMathEnv=True): + assert format in ('pdf', 'png', 'eps'), "rendermath: format %r not supported" % format + latexsource = self._normalizeLatex(latexsource) + if addMathEnv: + latexsource = '$' + latexsource + '$' + if format in ('pdf', 'eps'): + extra_header = '\usepackage{geometry}\n\geometry{textwidth=3.0in}\n' + fontsize = 10 + else: + fontsize = 12 + extra_header = '' + + latexsource = texmap.convertSymbols(latexsource) + + source = latex % dict(source=latexsource, + ident=format, + fontsize=fontsize, + extra_header=extra_header) + + m=md5() + m.update(source) + name = m.hexdigest() + + srcbase = os.path.join(self.basedir, name) + texfile = os.path.join(self.basedir, name+'.tex') + outfile = os.path.join(self.basedir, name+'.'+format) + + if os.path.exists(outfile): + return outfile # FIXME + + open(texfile, 'w').write(source) + + if not lazy: + self._render_file(name, format) + + + return outfile + + def render(self, latexsource, lazy=None, addMathEnv=True): + if lazy is None: + lazy = self.lazy + return self.convert(latexsource, lazy=lazy, format='png', addMathEnv=addMathEnv) + diff --git a/mwlib/resources/__init__.py b/mwlib/resources/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ 
b/mwlib/resources/__init__.py diff --git a/mwlib/resources/__init__.pyc b/mwlib/resources/__init__.pyc Binary files differ new file mode 100644 index 0000000..d46b5c5 --- /dev/null +++ b/mwlib/resources/__init__.pyc diff --git a/mwlib/resources/outgoing_link.gif b/mwlib/resources/outgoing_link.gif Binary files differ new file mode 100644 index 0000000..d508fc4 --- /dev/null +++ b/mwlib/resources/outgoing_link.gif diff --git a/mwlib/resources/pedia.css b/mwlib/resources/pedia.css new file mode 100644 index 0000000..3e814ef --- /dev/null +++ b/mwlib/resources/pedia.css @@ -0,0 +1,1250 @@ +/* ############# misc #################*/
+body, p, li, ul, a {
+ margin:0px;
+ padding:0px;
+}
+
+body {
+ background-color:#F4F5E7;
+ font-family:arial;
+ font-size:10pt;
+}
+
+img {
+ border:0;
+ }
+
+* {
+ font-size:10pt;
+ }
+
+h1, h1 * {
+ font-size:20pt;
+ }
+
+h2, h2 * {
+ font-size:14pt;
+ }
+
+h3, h3 * {
+ font-size:13pt;
+ }
+
+.invisible {
+ display: none;
+}
+
+/* ############### base ########### */
+
+* a, * a:hover, * a:visited, * a:active {
+ color:rgb(29,48,161);
+ }
+
+
+
+#header {
+ text-align:right;
+ height:74px;
+ padding:0px;
+ margin:0px;
+ table-layout:fixed;
+ overflow:hidden;
+}
+
+#main_nav {
+ padding-top:5px;
+ margin-right:20px;
+ height:47px !important;
+ /* height:50px !important; */
+ height:54px;
+ /* height:57px; */
+}
+
+#main_nav a {
+ font-size:10pt;
+ font-weight:bold;
+ color: black;
+ text-decoration:none;
+}
+
+#main_nav ul {
+ line-height:12pt;
+}
+
+#main_nav li {
+ display:inline;
+}
+
+#tab_nav {
+ position:absolute;
+ /* top:60px !important;
+ top:57px; */
+
+ top:66px !important;
+ top:63px;
+ right:30px;
+ border:0px;
+ }
+
+
+#errorarea, #busyarea {
+ position:absolute;
+ top:40%;
+ left:30%;
+ width:40%;
+ height:10%;
+ border:2px solid rgb(196,196,196);
+ background-color: rgb(255,136,0);
+ text-align:center;
+ }
+
+head:first-child+body #busyarea { /* this is a hack to prevent IE to read this*/
+ position:fixed;
+ }
+
+#busyarea {
+ /* background-color:#F4F5E7; */
+ background-color:white;
+ }
+
+#errorarea {
+ background-color: rgb(255,136,0);
+ }
+
+a#errorclose {
+ color:black;
+ font-weight:bold;
+ }
+
+/* ######### misc table formatting ################*/
+
+/*
+#managebook table {
+ padding:0px;
+ margin:0px;
+ border-collapse:collapse;
+ border-spacing:0px;
+ }
+*/
+
+.roundbox, .roundbox tr, .roundbox td, .roundbox img, .clean, #tab_nav img, .snippet img {
+ padding:0px;
+ margin:0px;
+ border:0px;
+ border-collapse:collapse;
+ border-spacing:0px;
+ /* border-style:hidden; */
+}
+
+.tablefull {
+ margin:0px;
+ border:0px;
+ border-spacing:0px;
+ width:100%;
+ padding:5px;
+}
+
+/*
+.roundbox td {
+ background-color:white;
+}
+*/
+
+.roundboxContent {
+ background-color:white;
+ }
+
+/* ################## managebook ################## */
+
+#book_nav {
+ line-height:16pt;
+ font-size:10pt;
+}
+
+#book_nav input {
+ margin:0px;
+ vertical-align:bottom;
+ font-size:10pt;
+ width:220px;
+}
+
+ .link {
+ color:black;
+ margin-right:15px;
+}
+#book_nav img {
+ border:0px;
+ padding:0px;
+ margin:0px;
+ margin-left:10px;
+ margin-right:10px;
+ vertical-align:bottom;
+}
+.info li {
+ list-style-type:square;
+}
+
+#info li {
+ list-style-type:square;
+ margin-left:35px;
+}
+
+#info h2 {
+ margin:10px;
+ margin-top:15px;
+ }
+
+#info h2 a {
+ color:black;
+ }
+
+#info h3 {
+ margin-bottom:5px;
+ }
+
+#info p {
+ margin:10px;
+ }
+
+#tab_nav .info {
+ border:1px solid rgb(128,128,128);
+ font-size:8pt;
+ padding-left:20px;
+}
+
+.no_list_style li{
+ list-style-type:none !important;
+}
+
+#info h3 {
+ margin-left:10px;
+ }
+
+/* ################## main ################### */
+
+#main {
+ margin-top:0px;
+ margin-right:0px;
+}
+
+#main_content {
+ padding-left:10px;
+ padding-right:0px;
+ padding-top:5px;
+}
+
+#fromsearch {
+ margin-right:350px;
+ margin-bottom:10px;
+ margin-left:10px;
+ /* background-color: rgb(200, 220, 255); */
+ background-color:rgb(240,240,240);
+ border:2px solid rgb(251,88,33);
+ padding:5px;
+ }
+
+#wikipagecontent {
+ padding: 0px 10px;
+ }
+
+#add_pages {
+ float:right;
+ margin:0px;
+ background-color:white;
+ border:0px;
+}
+
+#add_pages img {
+ border:0px;
+ margin:5px 10px;
+}
+
+/* ******************** wikipedia article formatting ******************* */
+
+#main {
+ font-family: "Trebuchet MS", Trebuchet, Verdana, sans-serif;
+ /* font-family: Palatino Linotype, Book Antiqua, Palatino; */
+ font-size:10pt;
+ overflow:hidden;
+}
+
+#main h1 {
+ font-size:20pt;
+ margin-bottom:30px;
+ margin-right:10px;
+ border-bottom:1px solid rgb(128,128,128);
+}
+
+#main h2 {
+ z-index:-1;
+ font-size:14pt;
+ margin-right:20px;
+ border-bottom:1px solid rgb(128,128,128);
+}
+
+#main h3 {
+ margin-top:10px;
+ margin-bottom:5px;
+ font-size:13pt;
+}
+
+
+#main p {
+ margin-top:5px;
+}
+
+#main a {
+ text-decoration:none;
+}
+
+#main a:hover {
+ text-decoration:underline;
+}
+
+#main ul {
+ margin: 5px 15px;
+}
+#main ul li {
+ list-style-type:square;
+ margin-left:20px;
+}
+
+
+#main table {
+ empty-cells:show;
+ background-color:white;
+ margin:10px;
+}
+
+
+
+#main th {
+ font-weight:bold;
+ }
+
+
+#main .deadlink, #main .deadlink:hover {
+ text-decoration:none;
+ color:black;
+ }
+
+
+#main .infobox {
+ float:right;
+ border:1px solid grey;
+ padding:2px;
+ }
+
+#main .infobox td {
+ padding:2px 4px;
+ }
+
+#main .bordered {
+ border-collapse:collapse;
+ }
+
+#main .bordered td {
+ border:1px solid grey;
+ }
+
+#main .borderless *, #main .borderless {
+ border:0 !important;
+ }
+
+
+#main .infobox * {
+ font-size: 8pt;
+ }
+
+#main sub, #main sup {
+ font-size: 8pt;
+ margin-right:3px;
+ }
+
+
+#main .small * {
+ font-size:8pt;
+ }
+
+
+.border {
+ border:1px solid rgb(128,128,128);
+ border-collapse:collapse;
+}
+
+.image {
+ margin:10px;
+ padding:2px;
+ border:1px solid rgb(128,128,128);
+ background:white;
+}
+
+.clear {
+ clear:both;
+ }
+
+.right {
+ clear:right;
+ float:right;
+}
+
+/*
+.right + .right {
+ clear:right;
+ }
+*/
+
+.left {
+ clear:left;
+ float:left;
+}
+
+/*
+.left + .left {
+ clear:left;
+ }
+*/
+
+#main .formula {
+ vertical-align:middle;
+ }
+
+
+.imagecaption {
+ display:block;
+ font-size:8pt;
+ padding:2px;
+}
+
+.imagecaption *{
+ font-size:8pt;
+}
+
+/* .wikitable, .wikitable tr, */
+
+.wikitable {
+ border-collapse:collapse;
+}
+
+.wikitable td {
+ border:1px solid black;
+ border-spacing:0px;
+ padding:0px 2px;
+ }
+
+.toccolours {
+ border:1px solid black;
+ }
+
+
+/* ###### misc... ##### */
+
+.addButton img {
+ width:12px;
+ height:12px;
+ border:0;
+ margin-right:10px;
+ }
+
+.addButton div {
+ margin-right:10px;
+ width:12px;
+ display:inline;
+ }
+
+
+.addButton a {
+ margin-left:0px;
+ }
+
+
+/* ############## pagelist ####################### */
+
+#pagelist {
+ padding:10px;
+ }
+
+#collTitle {
+ /* margin-left:10px; */
+ font-size:12pt;
+ font-weight:bold;
+ border-bottom:1px solid rgb(128,128,128);
+ }
+
+#collEditTable {
+ margin:10px 0px;
+ }
+
+.box_content {
+ margin:10px 0px;
+ font-size:9pt;
+}
+
+#collInfo {
+ margin-top:10px;
+ font-size:10pt;
+ }
+
+.del_col {
+ /* width:2em; */
+ width:21px;
+ padding:0px 2px;
+ }
+
+.priceInfo {
+ margin-bottom:10px;
+ font-size:8pt;
+ }
+
+#collNumPages {
+ margin-top:10px;
+}
+
+#collNumPages, #collPrice {
+ font-weight:bold;
+ }
+
+#clearbook {
+ margin-top:10px;
+ }
+
+
+div.uitable table{
+ border-collapse: collapse;
+ /* border:1px solid rgb(196,196,196); */
+ cursor:pointer;
+}
+
+div.uitable table tbody tr {
+ padding:3px;
+ margin:3px;
+}
+
+/*
+div.uitable table tbody tr.ui_hover td{
+ background-color:#BACFE4;
+}
+*/
+
+div.uitable table tbody tr.ui_active td {
+ /* background-color:#F4F5E7; */
+ font-weight:bold;
+
+}
+
+div.uitable td {
+ padding:0px 2px;
+ }
+
+div.uitable td:hover {
+ text-decoration:underline;
+ background-color:#BACFE4;
+ }
+
+
+/* ############################# index.html ########################### */
+
+#home_left, #home_right {
+ padding:0px 10px;
+ }
+
+#home_right span {
+ /* font-style:italic; */
+ border-bottom:1px solid rgb(128,128,128);
+
+}
+
+#home_right .heading {
+ width:450px;
+ border-bottom:1px solid rgb(128,128,128);
+ }
+
+
+#home_right li {
+ margin-left:20px;
+ font-style:normal;
+ font-weight: normal;
+ }
+
+#home_left p, #home_right p {
+ margin:0px 0px;
+ }
+
+#home_right h1 {
+ font-size:14pt;
+ /* color:rgb(64,64,64); */
+ color:rgb(32,32,32);
+ margin-right:15px;
+ margin-top:0px;
+ text-decoration:underline;
+ }
+
+#home_right h2 {
+ font-size:14pt;
+ color:rgb(64,64,64);
+ color:rgb(32,32,32);
+ margin-right:15px;
+ display:inline;
+ }
+
+#home_right ul {
+ margin-bottom:10px;
+ list-style-type:square;
+ }
+
+
+#home_left h2 {
+ font-size:14pt;
+ color:rgb(64,64,64);
+ color:rgb(32,32,32);
+ margin-top:5px;
+ margin-bottom:5px;
+ }
+
+#home_left li {
+ list-style-type:none;
+ margin-bottom:5px;
+ margin-left:0px;
+ padding-left:0px;
+ }
+
+#home_left .title {
+ font-weight:bold;
+ font-style:italic;
+ }
+
+#home_left td {
+ padding:3px 2px;
+ }
+
+
+
+/* "Finish book" form: bold right-aligned labels, fixed-width inputs. */
+#finish_book {
+ line-height:18pt;
+ }
+
+#finish_book .label {
+ font-weight:bold;
+ vertical-align:bottom;
+ text-align:right;
+ padding-right:5px;
+ }
+
+#finish_book .input {
+ vertical-align:bottom;
+ text-align:left;
+ padding-left:5px;
+ }
+
+#finish_book input {
+ padding: 0px 5px;
+ margin:2px 0px;
+ width:400px;
+ }
+
+#booktitle {
+ font-size:14pt;
+}
+
+#bookeditor {
+ font-size:12pt;
+ }
+
+
+#finish_book table {
+ margin-bottom:5px;
+}
+
+/* Page footer box, highlighted with the brand orange border. */
+#wp_footer {
+ clear:both;
+ float:right;
+ margin-top:10px;
+ margin-bottom:10px;
+ padding:5px;
+ /* border:2px solid rgb(230,230,230); */
+ border:2px solid rgb(251,88,33);
+ background-color:rgb(240,240,240);
+ }
+
+/* ++++++++++++++++++ order ++++++++++++++ */
+
+#order_pay h2 {
+ margin-top:0px;
+ }
+
+#order_content h2, #order_content h3, #showbook_content h2, #showbook_content h3 {
+ margin:5px 0px;
+ }
+
+#order_content h3, #showbook_content h3 {
+ margin-top:15px;
+ }
+
+
+#order_content li, #showbook_content li {
+ margin-right:10px;
+ list-style-type:none;
+ border-bottom:1px dotted grey;
+ }
+
+#order_content, #order_pay, #showbook_content {
+ margin:10px;
+ }
+
+#order_content a, #showbook_content a {
+ text-decoration:none;
+ }
+
+#order_content a:hover, #showbook_content a:hover {
+ text-decoration:underline;
+ }
+
+/* Label/value table cells shared by the cost, shipping and contact forms. */
+#cost .label, #ship .label, #contact_form .label {
+ text-align:right;
+ vertical-align:top;
+ }
+/* Fixed: a comma was missing after "#cost .value", which turned the selector
+   into a descendant selector and left #ship .value unstyled. */
+#cost .value, #ship .value, #contact_form .value {
+ text-align:left;
+ }
+
+#cost td, #ship td, #contact_form td {
+ padding:2px 5px;
+ }
+
+#cost {
+ border:1px solid grey;
+ }
+
+#ship input, #ship textarea, #ship select {
+ width:250px;
+ }
+
+#terms_ok {
+ margin-left:10px;
+ }
+
+#terms_ok input {
+ margin-left:0px;
+
+ }
+
+#contact_form input, #contact_form select {
+ width: 250px;
+ }
+
+#contact_form textarea {
+ width:400px;
+ }
+
+#contact h2 {
+ margin-top:0px;
+ }
+
+
+/* Floating tooltip box shown above all other content. */
+.tooltip {
+ background-color: rgb(255, 250, 200);
+ border: 1px solid black;
+ position: absolute;
+ z-index: 10000;
+ width: 200px;
+ padding:2px;
+ text-align:center;
+}
+
+/* Floating article-snippet preview box. */
+.snippet {
+ background-color: rgb(200, 220, 255);
+ border: 1px solid black;
+ padding: 5px;
+ z-index: 10000;
+ position: absolute;
+ font-size: 10px;
+}
+
+/* Reset all formatting inside the snippet body, then re-enable em/strong below. */
+#snippetid, #snippetid * {
+ font-size: 10pt;
+ font-style:normal;
+ font-weight:normal;
+}
+
+#snippetid em {
+ font-style:italic;
+ }
+
+#snippetid strong {
+ font-weight:bold;
+ }
+
+
+.hasborder {
+ border: 2px solid rgb(220,220,220);
+}
+
+.noborder {
+ /* border-width: 0px; dead: fully overridden by the shorthand below */
+ border: 2px solid white;
+}
+
+.greyedout {
+ color: rgb(127, 127, 127);
+}
+
+.notgreyedout {
+ color: black;
+}
+
+/* Terms-and-conditions page: compact 8pt body text with a mini TOC. */
+#terms dt {
+ margin-top:10px;
+ font-weight:bold;
+ }
+
+#terms .toc li {
+ margin-left:15px;
+ list-style-type:none;
+ }
+
+#terms p {
+ margin-top:10px;
+
+ }
+
+
+#terms p, #terms dt, #terms dd, #terms li, #terms a, #terms strong{
+ font-size:8pt;
+ }
+
+#terms h2, #terms h2 a {
+ font-size:12pt;
+ color:black;
+ }
+
+#terms h3 {
+ font-size:10pt;
+ }
+
+#terms li {
+ list-style-type:square;
+ }
+
+#refreshproposals {
+ padding:5px;
+ margin:5px;
+ margin-right:0px;
+ }
+
+
+/* Internal site links: dotted underline, no hover decoration change. */
+.site_link {
+ border-bottom:1px dotted black;
+ color:black;
+ text-decoration:none;
+}
+
+.site_link:hover {
+ text-decoration:none !important;
+ }
+
+
+/* Category browser: top-level category headings and their sub-lists. */
+.topcats, .topcats * {
+ font-weight:bold;
+ list-style-type:none !important;
+ margin:0px;
+ padding:0px;
+ font-size:12pt;
+ border-bottom:1px solid rgb(128,128,128);
+ margin-top:5px;
+ }
+
+.topcats {
+ margin-right:5px;
+}
+
+.subcats {
+ margin-right:0px;
+ }
+
+
+#preview {
+ margin:10px;
+ }
+
+#preview h1 {
+ margin-top:0px;
+ margin-bottom:10px;
+ }
+
+/* PDF-preview sidebar box. */
+#adobe_info {
+ float:right;
+ width:33%;
+ /* border:1px solid rgb(128,128,128); superseded by the value below */
+ border:1px solid rgb(196,196,196);
+ margin:10px;
+ margin-right:0px;
+ margin-top:0px;
+ padding:5px;
+
+ }
+
+#generating {
+ margin:0px;
+ }
+
+#finished p {
+ margin-bottom:10px;
+ }
+
+/* Affiliate program pages. */
+#affiliate {
+ padding:5px;
+ padding-top:0px;
+ }
+#affiliate p{
+ margin-bottom:10px;
+ }
+
+#affiliate li {
+ list-style-type:square;
+ margin-left:25px;
+ }
+#affiliate ul {
+ margin-bottom:10px;
+}
+
+#affiliatetable td{
+ padding:10px;
+ }
+
+#affiliatetable li {
+ list-style-type:none;
+ }
+
+#affiliatetable input {
+ margin-left:0px;
+ }
+
+/* Two side-by-side affiliate info boxes; borders currently disabled. */
+.affiliatebox {
+ float:left;
+ width:45%;
+ margin:10px;
+ /* padding:10px; */
+ /* border:2px solid rgb(210,210,210); */
+ /* border:2px solid rgb(251,88,33); */
+ }
+
+.affiliateboxcontent {
+ margin:10px;
+ }
+
+.affiliatebox h2 {
+ margin:0px;
+ margin-bottom:10px;
+ }
+
+.affiliatebox ul li {
+ list-style-type:square;
+ margin-left:15px;
+ }
+
+.affiliatebox .nobullets li {
+ list-style-type:none;
+ margin-left:0px;
+ }
+
+
+
+/* Welcome banner on the start page. */
+#welcome {
+ padding:10px;
+ padding-top:5px;
+ margin:0px;
+ }
+
+#welcome h1 {
+ margin:0px;
+ margin-bottom:10px;
+ font-size:16pt;
+ color:rgb(32,32,32);
+ }
+
+
+/* "How it works" table: big grey step numbers floated next to each cell. */
+#home_howto td{
+ padding:5px;
+ vertical-align:top;
+ }
+
+#home_howto .step_num {
+ font-size:20pt;
+ color:rgb(128,128,128);
+ float:left;
+ margin:5px;
+ }
+
+#home_howto li {
+ list-style-type:square;
+ margin-left:20px;
+ }
+
+/* News column: items separated by a dotted rule, date floated right. */
+#news .newsitem {
+ margin: 10px;
+ max-width:500px;
+ }
+
+#news .newsitem h3 {
+ font-size:12pt;
+ margin-bottom:5px;
+ margin-top:10px;
+ }
+
+.newsitem + .newsitem {
+ border-top:1px dotted rgb(128,128,128);
+ }
+
+#news .date {
+ float:right;
+ margin-top:5px;
+ margin-left:10px;
+ font-size:8pt;
+ }
+
+.deadlink {
+ color:black;
+}
+
+.smalltext, .smalltext em, .smalltext * {
+ font-size:8pt !important;
+ }
+
+/* Warning frames around articles whose templates were filtered. */
+.template_warningsmart {
+ border:3px solid red;
+}
+.template_warningmanual{
+ border:3px solid orange;
+ }
+
+.template_warningsmart * {
+ background-color:rgb(210,210,210);
+ }
+
+.template_warningmanual * {
+ background-color:rgb(230,230,230);
+ }
+
+
+/* Start-page category boxes. */
+ #startcats {
+ padding:5px;
+ }
+
+ #startcats h2, #startcats h3 {
+ margin:5px 0px;
+ }
+
+#startcats .topcats {
+ margin-bottom:5px;
+}
+
+#startcats .topcats *, #startcats .subcats * {
+ text-decoration:none;
+ }
+
+#startcats .topcats a {
+ border-bottom:0;
+ }
+
+.startcatbox {
+ width:30%;
+ float:left;
+ padding:10px;
+ padding-bottom:0px;
+ }
+
+.startcatbox h3 {
+ border-bottom:1px solid rgb(128,128,128);
+ }
+
+
+/* Page-count hint boxes, styled like the footer's orange-framed box. */
+#lowpagecount, #highpagecount {
+ margin:10px 0px;
+ padding:10px;
+ border:2px solid rgb(251,88,33);
+ background-color:rgb(240,240,240);
+ }
+
+#highpagecount {
+ margin-left:10px;
+ }
+
+
+/* Example-book gallery with a floated info box and navigation list. */
+#examplebook {
+ padding:10px;
+ }
+
+#examplebook li {
+ list-style-type: none;
+ text-align:center;
+ margin-bottom:5px;
+ }
+
+#examplebook .book_img img {
+ border:1px dotted rgb(128,128,128);
+ padding:2px;
+ }
+
+#examplebook .info {
+ float:right;
+ border:1px dotted rgb(128,128,128);
+ padding:5px;
+ }
+
+#examplebook .navigation {
+ position:absolute;
+ margin:15px 0px;
+ border:1px dotted rgb(128,128,128);
+ padding:5px 15px;
+ }
+
+#examplebook .navigation li {
+ text-align:left;
+ }
+
+#examplebook .navigation a {
+ text-decoration:none;
+ }
+
+#examplebook .navigation a:hover {
+ text-decoration:underline;
+ }
+
+
+
+
+
+
+/* ######## pssearch */
+
+
+
+.pssnippet strong {
+ font-weight:normal;
+ }
+
+.pssnippet li {
+ margin-left:15px;
+ }
+
+/* Highlight for matched search terms inside snippets. */
+.pssnippethl {
+ background-color: rgb(196, 224, 255);
+}
+
+
+#pssearch a {
+ color:rgb(28,53,110);
+ }
+
+/*
+#pssearch li {
+ margin-bottom:10px;
+ list-style-type:square;
+ }
+*/
+
+li.result {
+ margin-bottom:10px;
+ list-style-type:square;
+ }
+
+#pssearch .details, #pssearch .details * {
+ font-size:8pt;
+ }
+
+.marginbox {
+ padding:5px 10px;
+ }
+
+#searchresultlist .result {
+ margin: 0px 5px;
+ padding: 8px 0px;
+ list-style-type:none;
+ border-bottom:1px dotted rgb(128,128,128);
+ /* border-top:1px dotted rgb(128,128,128);*/
+ /*background:rgb(245, 250, 250);*/
+}
+
+/*
+.alternate {
+ background: rgb(234, 238, 245);
+}
+*/
+
+.clustered {
+ margin-left: 40px;
+}
+
+.resultdetails * {
+ /* Fixed: was "text-align:middle", which is not a valid text-align value. */
+ text-align:center;
+ }
+
+/* display styles for snippets search result snippet*/
+.pssnippet hr {
+ display:none;
+}
+
+/* (a duplicate ".pssnippet li { margin-left:15px; }" rule was removed here;
+   the identical rule near the top of this section already applies) */
+
+/* end filter */
+
+#categories li {
+ list-style-type: none;
+}
+
+#categories h2 {
+ margin:0px;
+ font-size: 12pt;
+ }
+
+#categories a:hover {
+ background-color:rgb(1,160,199);
+ color:white;
+ text-decoration:none;
+}
+
+#pssearch .subcats {
+ margin-bottom:0px;
+ margin-left:15px;
+ list-style-type:none;
+}
+
+#searchselection {
+ margin-left:10px;
+ vertical-align:middle;
+}
+
+#searchselection input {
+ color:green;
+ margin-left:10px;
+ margin-right:5px;
+ vertical-align:middle;
+}
+
+.searchresulturl {
+ color: rgb(0, 96, 120);
+}
+
+.psformat {
+ font-size: 8pt;
+}
+
+.psrf {
+ float: right;
+}
+
+.pscategorylink, .psarticlelink {
+ white-space: nowrap;
+}
+
+.matchedcategories {
+ border: 1px dotted grey;
+ /*background-color: rgb(245, 245, 245);*/
+ margin-left: 10px;
+ margin-right: 10px;
+ margin-bottom: 15px;
+ padding: 4px;
+}
+
diff --git a/mwlib/sanitychecker.py b/mwlib/sanitychecker.py new file mode 100644 index 0000000..78d25b8 --- /dev/null +++ b/mwlib/sanitychecker.py @@ -0,0 +1,205 @@ +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. +""" +class for defining DTD-Like Rules for the tree +""" +from advtree import Article + +from mwlib.log import Log +log = Log("sanitychecker") + +# ----------------------------------------------------------- +# Constraints +# ----------------------------------------------------------- + +class ConstraintBase(object): + def __init__(self, *klasses): + self.klasses = klasses + + def test(self, nodes): + return True,None # passed + + def __repr__(self): + return "%s(%s)" %(self.__class__.__name__, ", ".join(k.__name__ for k in self.klasses)) + + +class Forbid(ConstraintBase): + "forbid any of the classes" + def test(self, nodes): + for n in nodes: + if n.__class__ in self.klasses: + return False, n + return True, None + + +class Allow(ConstraintBase): + "allow only these classes" + def test(self, nodes): + for n in nodes: + if not n.__class__ in self.klasses: + return False, n + return True, None + + +class Require(ConstraintBase): + "require any of these classes" + def test(self, nodes): + for n in nodes: + if n.__class__ in self.klasses: + return True, n + return False, None + +class Equal(ConstraintBase): + "node classes and their order must be equal to these klasses" + def test(self, nodes): + if len(nodes) != len(self.klasses): + return False, None # FIXME what could we report? + for i,n in enumerate(nodes): + if n.__class__ != self.klasses[i]: + return False, n + return True, None + + +# ----------------------------------------------------------- +# Rules regarding [Children, AllChildren, Parents, ...] 
+# ----------------------------------------------------------- + +class RuleBase: + def __init__(self, klass, constraint): + self.klass = klass + self.constraint = constraint + + def _tocheck(self, node): + return [] + + def test(self, node): + if node.__class__ == self.klass: + return self.constraint.test( self._tocheck(node) ) + return True, None + + def __repr__(self): + return "%s(%s, %r)" %(self.__class__.__name__, self.klass.__name__, self.constraint) + +class ChildrenOf(RuleBase): + def _tocheck(self, node): + return node.children + +class AllChildrenOf(RuleBase): + def _tocheck(self, node): + return node.getAllChildren() + +class ParentsOf(RuleBase): + def _tocheck(self, node): + return node.parents + +class ParentOf(RuleBase): + def _tocheck(self, node): + if node.parent: + return [node.parent] + return [] + +class SiblingsOf(RuleBase): + def _tocheck(self, node): + return node.siblings + + + +# example custom rules + +class RequireChild(RuleBase): + + def __init__(self, klass): + self.klass = klass + + def __repr__(self): + return "%s(%s)" %(self.__class__.__name__, self.klass.__name__) + + def test(self, node): + if node.__class__ == self.klass: + if not len(node.children): + return False, node + return True, None + + + + +# ----------------------------------------------------------- +# Callbacks +# ----------------------------------------------------------- +""" +callbacks get called if added to rules +callback return values should be: + * True if it modified the tree and the sanity check needs to restart + * False if the tree is left unmodified +""" +class SanityException(Exception): + pass + +def exceptioncb(rule, node=None, parentnode=None): + raise SanityException("%r err:%r" %(rule, node or parentnode) ) + +def warncb(rule, node=None, parentnode=None): + log.warn("%r node:%r parent:%r" %(rule, node, parentnode)) + return False + +def removecb(rule, node=None, parentnode=None): + assert node and node.parent + node.parent.removeChild(node) + return 
True + + + +# ----------------------------------------------------------- +# Container for sanity rules +# ----------------------------------------------------------- + +class SanityChecker(object): + + def __init__(self): + self.rules = [] + + def addRule(self, rule, actioncb=exceptioncb): + self.rules.append((rule, actioncb)) + + def check(self, tree): + """ + check each node with each rule + on failure call callback + """ + modified = True + while modified: + modified = False + for node in tree.allchildren(): + #if node.__class__ == Article: + # log.info("checking article:", node.caption.encode('utf-8')) + for r,cb in self.rules: + passed, errnode = r.test(node) + if not passed and cb: + if cb(r, errnode or node): + modified = True + break + if modified: + break + +def demo(tree): + "for documentation only, see tests for more demos" + from mwlib.advtree import Table, Row, Cell, Text, ImageLink, PreFormatted + + sc = SanityChecker() + rules = [ChildrenOf(Table, Allow(Row)), + ChildrenOf(Row, Allow(Cell)), + AllChildrenOf(Cell, Require(Text, ImageLink)), + AllChildrenOf(Cell, Forbid(PreFormatted)), + ChildrenOf(PreFormatted, Equal(Text)), + ] + + def mycb(rule, node=None, parentnode=None): + print "failed", rule, node or parentnode + modifiedtree = False + return modifiedtree + + for r in rules: + sc.addRule( r, mycb) + #sc.check(anytree) + + diff --git a/mwlib/scanfile.py b/mwlib/scanfile.py new file mode 100755 index 0000000..96519d2 --- /dev/null +++ b/mwlib/scanfile.py @@ -0,0 +1,29 @@ +#! 
/usr/bin/env python + +"""used for debugging/testing""" + +import sys +import time +import mwscan + +d=unicode(open(sys.argv[1]).read(), 'utf-8') + +stime=time.time() +r=mwscan.scan(d) +needed = time.time()-stime +for x in r: + print r.repr(x) + +print needed, len(d), len(r) + + + +# stime=time.time() +# r=mwscan.compat_scan(d) +# needed = time.time()-stime + +# print "COMPAT:", needed, len(d), len(r) + + +# #mwscan.dump_tokens(d,r) +# #print needed, len(d), len(r) diff --git a/mwlib/scanner.py b/mwlib/scanner.py new file mode 100755 index 0000000..d4d7167 --- /dev/null +++ b/mwlib/scanner.py @@ -0,0 +1,6 @@ +#! /usr/bin/env python + +if 0: + from plexscanner import TagToken, EndTagToken, tokenize +else: + from mwscan import TagToken, EndTagToken, tokenize diff --git a/mwlib/texmap.py b/mwlib/texmap.py new file mode 100755 index 0000000..f3071e6 --- /dev/null +++ b/mwlib/texmap.py @@ -0,0 +1,95 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +import re + +def convertSymbols(latexsource): + def repl(mo): + name=mo.group(0) + return symbolMap.get(name, name) + + latexsource = texcmd.sub(repl, latexsource) + return latexsource + +texcmd = re.compile(r"\\[a-zA-Z]+") + +symbolMap = {'\\Bbb': '\\mathbb', + '\\Complex': '\\mathbb{C}', + '\\Dagger': '\\ddagger', + '\\Darr': '\\Downarrow', + '\\Harr': '\\Leftrightarrow', + '\\Larr': '\\Leftarrow', + '\\Lrarr': '\\Leftrightarrow', + '\\N': '\\mathbb{N}', + '\\O': '\\emptyset', + '\\R': '\\mathbb{R}', + '\\Rarr': '\\Rightarrow', + '\\Reals': '\\mathbb{R}', + '\\Uarr': '\\Uparrow', + '\\Z': '\\mathbb{Z}', + '\\alef': '\\aleph', + '\\alefsym': '\\aleph', + '\\and': '\\land', + '\\ang': '\\angle', + '\\arccos': '\\mathop{\\mathrm{arccos}}', + '\\arccot': '\\mathop{\\mathrm{arccot}}', + '\\arccsc': '\\mathop{\\mathrm{arccsc}}', + '\\arcsec': '\\mathop{\\mathrm{arcsec}}', + '\\bold': '\\mathbf', + '\\bull': '\\bullet', + '\\clubs': '\\clubsuit', + '\\cnums': '\\mathbb{C}', + '\\dArr': '\\Downarrow', + '\\darr': '\\downarrow', + '\\diamonds': '\\diamondsuit', + '\\empty': '\\emptyset', + '\\exist': '\\exists', + '\\ge': '\\geq', + '\\hAar': '\\Leftrightarrow', + '\\harr': '\\leftrightarrow', + '\\hearts': '\\heartsuit', + '\\image': '\\Im', + '\\infin': '\\infty', + '\\isin': '\\in', + '\\lArr': '\\Leftarrow', + '\\lang': '\\langle', + '\\larr': '\\leftarrow', + '\\le': '\\leq', + '\\lrArr': '\\Leftrightarrow', + '\\lrarr': '\\leftrightarrow', + '\\natnums': '\\mathbb{N}', + '\\ne': '\\neq', + '\\or': '\\lor', + '\\part': '\\partial', + '\\plusmn': '\\pm', + '\\rArr': '\\Rightarrow', + '\\rang': '\\rangle', + '\\rarr': '\\rightarrow', + '\\real': '\\Re', + '\\reals': '\\mathbb{R}', + '\\sdot': '\\cdot', + '\\sect': '\\S', + '\\sgn': '\\mathop{\\mathrm{sgn}}', + '\\spades': '\\spadesuit', + '\\sub': '\\subset', + '\\sube': '\\subseteq', + '\\supe': '\\supseteq', + '\\thetasym': '\\vartheta', + '\\uArr': '\\Uparrow', + '\\uarr': '\\uparrow', + '\\weierp': '\\wp', + 
'\\Alpha': 'A{}', + '\\Beta': 'B{}', + '\\Epsilon': 'E{}', + '\\Zeta': 'Z{}', + '\\Eta': 'H{}', + '\\Iota': 'I{}', + '\\Kappa' : 'K{}', + '\\Mu': 'M{}', + '\\Nu': 'N{}', + '\\Rho': 'P{}', + '\\Tau': 'T{}', + '\\Chi': 'C{}', + } diff --git a/mwlib/timeline.py b/mwlib/timeline.py new file mode 100755 index 0000000..e85dd84 --- /dev/null +++ b/mwlib/timeline.py @@ -0,0 +1,52 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. + +"""implement http://meta.wikimedia.org/wiki/EasyTimeline +""" + +import os +import tempfile +try: + from hashlib import md5 +except ImportError: + from md5 import md5 + + +def drawTimeline(script, basedir=None): + if isinstance(script, unicode): + script = script.encode('utf8') + if basedir is None: + basedir = os.path.join(tempfile.gettempdir(), "timeline-%s" % (os.getuid(),)) + if not os.path.exists(basedir): + os.mkdir(basedir) + + m=md5() + m.update(script) + ident = m.hexdigest() + + pngfile = os.path.join(basedir, ident+'.png') + + if os.path.exists(pngfile): + return pngfile + + scriptfile = os.path.join(basedir, ident+'.txt') + open(scriptfile, 'w').write(script) + et = os.path.join(os.path.dirname(__file__), "EasyTimeline.pl") + + err = os.system("perl %s -P /usr/bin/ploticus -T /tmp/ -i %s" % (et, scriptfile)) + if err != 0: + return None + + svgfile = os.path.join(basedir, ident+'.svg') + + if os.path.exists(svgfile): + os.unlink(svgfile) + + if os.path.exists(pngfile): + return pngfile + + return None + + diff --git a/mwlib/uparser.py b/mwlib/uparser.py new file mode 100755 index 0000000..8565c74 --- /dev/null +++ b/mwlib/uparser.py @@ -0,0 +1,126 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +"""usable/user parser""" + +from mwlib import parser, scanner, expander + +def simplify(node): + "concatenates textnodes in order to reduce the number of objects" + Text = parser.Text + + last = None + toremove = [] + for i,c in enumerate(node.children): + if c.__class__ == Text: # would isinstance be safe? + if last: + last.caption += c.caption + toremove.append(i) + else: + last = c + else: + simplify(c) + last = None + + for i,ii in enumerate(toremove): + del node.children[ii-i] + +def fixlitags(node): + Text = parser.Text + + if not isinstance(node, parser.ItemList): + idx = 0 + while idx < len(node.children): + if isinstance(node.children[idx], parser.Item): + lst = parser.ItemList() + lst.append(node.children[idx]) + node.children[idx] = lst + idx += 1 + while idx<len(node.children): + if isinstance(node.children[idx], parser.Item): + lst.append(node.children[idx]) + del node.children[idx] + elif node.children[idx]==Text("\n"): + del node.children[idx] + else: + break + else: + idx += 1 + + for x in node.children: + fixlitags(x) + +def removeBoilerplate(node): + i = 0 + while i < len(node.children): + x = node.children[i] + if isinstance(x, parser.TagNode) and x.caption=='div': + try: + klass = x.values.get('class', '') + except AttributeError: + klass = '' + + if 'boilerplate' in klass: + del node.children[i] + continue + + i += 1 + + for x in node.children: + removeBoilerplate(x) + + + + +postprocessors = [removeBoilerplate, simplify, fixlitags] + +def parseString(title=None, raw=None, wikidb=None, revision=None): + """parse article with title from raw mediawiki text""" + assert title is not None + + if raw is None: + raw = wikidb.getRawArticle(title, revision=revision) + assert raw is not None, "cannot get article %r" % (title,) + if wikidb: + te = expander.Expander(raw, pagename=title, wikidb=wikidb) + input = te.expandTemplates() + else: + input = raw + + tokens = scanner.tokenize(input, title) + + a = parser.Parser(tokens, title).parse() + a.caption 
= title + for x in postprocessors: + x(a) + return a + + +def simpleparse(raw): # !!! USE FOR DEBUGGING ONLY !!! does not use post processors + import sys + from mwlib import dummydb + db = dummydb.DummyDB() + + tokens = scanner.tokenize(raw) + r=parser.Parser(tokens, "unknown").parse() + parser.show(sys.stdout, r, 0) + return r + +def main(): + from mwlib.dummydb import DummyDB + + import os + import sys + + db = DummyDB() + + for x in sys.argv[1:]: + input = unicode(open(x).read(), 'utf8') + title = unicode(os.path.basename(x)) + parseString(title, input, db) + +if __name__=="__main__": + main() + diff --git a/mwlib/utils.py b/mwlib/utils.py new file mode 100644 index 0000000..4fd6b55 --- /dev/null +++ b/mwlib/utils.py @@ -0,0 +1,112 @@ +import os +import sys +import errno +import time + +# provide all for python 2.4 +try: + from __builtin__ import all +except ImportError: + def all(items): + for x in items: + if not x: + return False + return True + +def fsescape(s): + res = [] + for x in s: + c = ord(x) + if c>127: + res.append("~%s~" % c) + elif c==126: # ord("~")==126 + res.append("~~") + else: + res.append(x) + return "".join(res) + +def start_logging(path): + sys.stderr.flush() + sys.stdout.flush() + + f = open(path, "a") + fd = f.fileno() + os.dup2(fd, 1) + os.dup2(fd, 2) + + null=os.open('/dev/null', os.O_RDWR) + os.dup2(null, 0) + os.close(null) + +def daemonize(dev_null=False): + # See http://www.erlenstar.demon.co.uk/unix/faq_toc.html#TOC16 + if os.fork(): # launch child and... + os._exit(0) # kill off parent + os.setsid() + if os.fork(): # launch child and... + os._exit(0) # kill off parent again. 
+ os.umask(077) + if dev_null: + null=os.open('/dev/null', os.O_RDWR) + for i in range(3): + try: + os.dup2(null, i) + except OSError, e: + if e.errno != errno.EBADF: + raise + os.close(null) + +def shell_exec(cmd): + """Execute cmd in a subshell + + @param cmd: command to execute with os.system(), if given as unicode its + converted to str using sys.getfilesystemencoding() + @type cmd: basestring + + @returns: exit code of command + @rtype: int + """ + if isinstance(cmd, unicode): + enc = sys.getfilesystemencoding() + assert enc is not None, 'no filesystem encoding (set LANG)' + cmd = cmd.encode(enc, 'ignore') + return os.system(cmd) + + +def get_multipart(filename, data, name): + """Build data in format multipart/form-data to be used to POST binary data. + + @param filename: filename to be used in multipart request + @type filenaem: basestring + + @param data: binary data to include + @type data: str + + @param name: name to be used in multipart request + @type name: basestring + + @returns: tuple containing content-type and body for the request + @rtype: (str, str) + """ + + if isinstance(filename, unicode): + filename = filename.encode('utf-8', 'ignore') + if isinstance(name, unicode): + name = name.encode('utf-8', 'ignore') + + boundary = "-"*20 + ("%f" % time.time()) + "-"*20 + + items = [] + items.append("--" + boundary) + items.append('Content-Disposition: form-data; name="%(name)s"; filename="%(filename)s"'\ + % {'name': name, 'filename': filename}) + items.append('Content-Type: application/octet-stream') + items.append('') + items.append(data) + items.append('--' + boundary + '--') + items.append('') + + body = "\r\n".join(items) + content_type = 'multipart/form-data; boundary=%s' % boundary + + return content_type, body diff --git a/mwlib/web.py b/mwlib/web.py new file mode 100755 index 0000000..5e32fb8 --- /dev/null +++ b/mwlib/web.py @@ -0,0 +1,122 @@ +#! 
/usr/bin/env python + +"""simple wsgi app for serving mediawiki content +""" + +import os +import mimetypes +import StringIO +from mwlib import uparser, htmlwriter, rendermath + +class Pngmath(object): + def __init__(self, basedir): + self.basedir = basedir + + def __call__(self, env, start_response): + pi = env['PATH_INFO'] + path = pi.split('/', 2)[-1] + path = path.strip("/") + path = path[:-len(".png")] + + pngfile = os.path.join(self.basedir, path+'.png') + if not os.path.exists(pngfile): + texfile = os.path.join(self.basedir, path+'.tex') + if not os.path.exists(texfile): + start_response('404 Not found', [('Content-Type', 'text/plain')]) + return ["404 not found"] + + r = rendermath.Renderer() + r._render_file(path, 'png') + + + d=open(pngfile, 'rb').read() + + + start_response('200 Ok', [('Content-Type', 'image/png')]) + return [d] + +class Files(object): + def __init__(self, basedir): + self.basedir = basedir + + def __call__(self, env, start_response): + pi = env['PATH_INFO'] + path = pi.split('/', 2)[-1] + path = path.strip("/") + assert ".." 
not in path, "path must not contain '..'" + + mt, enc = mimetypes.guess_type(path) + + try: + f=open(os.path.join(self.basedir, path), 'rb') + except (IOError, OSError), err: + print "ERROR:", err + start_response('404 Not found', [('Content-Type', 'text/plain')]) + return ["404 not found"] + + send = start_response('200 OK', [('Content-type', mt or 'text/plain; charset=utf-8')]) + while 1: + data=f.read(0x20000) + if not data: + break + send(data) + return [] + + +class Serve(object): + head = """<?xml version="1.0" encoding="UTF-8"?> +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> +<head> +<meta http-equiv="content-type" content="text/html; charset="utf-8"></meta> +<link rel="stylesheet" href="/resources/pedia.css" /> +</head> +<body> +""" + def __init__(self, db, images): + self.db = db + self.images = images + from mwlib import resources + self.resources = Files(os.path.dirname(resources.__file__)) # FIXME + self.image_files = Files(os.path.expanduser("~/images")) # FIXME + self.pngmath = Pngmath(os.path.expanduser("~/pngmath")) # FIXME + self.timeline = Files(os.path.expanduser("~/timeline")) # FIXME + + def show(self, env, start_response): + article = unicode(env['PATH_INFO'], 'utf-8').strip('/').replace("_", " ") + article = article[:1].upper()+article[1:] # FIXME: we should redirect instead. 
+ + raw=self.db.getRawArticle(article) + if not raw: + start_response('404 Not found', [('Content-Type', 'text/plain')]) + return ["Article %r not found" % (article,)] + + send = start_response('200 OK', [('Content-type', 'text/html; charset=utf-8')]) + send(self.head) + + out=StringIO.StringIO(u"") + + a=uparser.parseString(article, raw=raw, wikidb=self.db) + w=htmlwriter.HTMLWriter(out, self.images) + w.write(a) + + return [out.getvalue().encode('utf-8')] + + def __call__(self, env, start_response): + path = env['PATH_INFO'] + + + if path.startswith("/resources/"): + return self.resources(env, start_response) + if path.startswith("/images"): + return self.image_files(env, start_response) + if path.startswith("/pngmath/"): + return self.pngmath(env, start_response) + if path.startswith("/timeline/"): + return self.timeline(env, start_response) + + return self.show(env, start_response) + + + start_response('404 Not found', [('Content-Type', 'text/plain')]) + return ["404 Not found"] diff --git a/mwlib/wiki.py b/mwlib/wiki.py new file mode 100755 index 0000000..96378ed --- /dev/null +++ b/mwlib/wiki.py @@ -0,0 +1,135 @@ +#! /usr/bin/env python + +# Copyright (c) 2007-2008 PediaPress GmbH +# See README.txt for additional licensing information. 
+ +import os +from ConfigParser import ConfigParser + +def wiki_mwapi(base_url=None, license=None, template_blacklist=None): + from mwlib import mwapidb + return mwapidb.WikiDB(base_url, license, template_blacklist) + +def wiki_zip(path=None, url=None, name=None): + from mwlib import zipwiki + return zipwiki.Wiki(path) + +def wiki_net(articleurl=None, url=None, name=None, imagedescriptionurls=None, + templateurls=None, templateblacklist=None, defaultarticlelicense=None, + defaultauthors=None, **kwargs): + from mwlib import netdb + + if templateurls: + templateurls = [x for x in templateurls.split() if x] + else: + raise RuntimeError("templateurls parameter for netdb not set in [wiki] section") + + if imagedescriptionurls: + imagedescriptionurls = [x for x in imagedescriptionurls.split() if x] + else: + raise RuntimeError("imagedescriptionurls parameter for netdb not set in [wiki] section") + + if defaultauthors: + defaultauthors = [a.strip() for a in defaultauthors.split(',')] + + return netdb.NetDB(articleurl, + imagedescriptionurls=imagedescriptionurls, + templateurls=templateurls, + templateblacklist=templateblacklist, + defaultauthors=defaultauthors, + ) + +def wiki_cdb(path=None, **kwargs): + from mwlib import cdbwiki + path = os.path.expanduser(path) + db=cdbwiki.WikiDB(path) + return db + +def image_mwapi(base_url=None, shared_base_url=None): + from mwlib import mwapidb + return mwapidb.ImageDB(base_url, shared_base_url) + +def image_download(url=None, localpath=None, knownlicenses=None): + assert url, "must supply url in [images] section" + from mwlib import netdb + + if localpath: + localpath = os.path.expanduser(localpath) + urls = [x for x in url.split() if x] + assert urls + + if knownlicenses: + knownlicenses = [x for x in knownlicenses.split() if x] + else: + knownlicenses = None + + imgdb = netdb.ImageDB(urls, cachedir=localpath, knownLicenses=knownlicenses) + return imgdb + +def image_zip(path=None): + from mwlib import zipwiki + return 
zipwiki.ImageDB(path) + + + +dispatch = dict( + images = dict(mwapi=image_mwapi, download=image_download, zip=image_zip), + wiki = dict(mwapi=wiki_mwapi, cdb=wiki_cdb, net=wiki_net, zip=wiki_zip) +) + +def _makewiki(conf): + res = {} + + # yes, I really don't want to type this everytime + wc = os.path.join(conf, "wikiconf.txt") + if os.path.exists(wc): + conf = wc + + if conf.startswith("http://") or conf.startswith("https://"): + res['wiki'] = wiki_mwapi(conf) + res['images'] = image_mwapi(conf) + return res + + + if conf.lower().endswith(".zip"): + from mwlib import zipwiki + res['wiki'] = zipwiki.Wiki(conf) + res['images'] = zipwiki.ImageDB(conf) + return res + + cp=ConfigParser() + + if not cp.read(conf): + raise RuntimeError("could not read config file %r" % (conf,)) + + + for s in ['images', 'wiki']: + if not cp.has_section(s): + continue + + args = dict(cp.items(s)) + if "type" not in args: + raise RuntimeError("section %r does not have key 'type'" % s) + t = args['type'] + del args['type'] + try: + m = dispatch[s][t] + except KeyError: + raise RuntimeError("cannot handle type %r in section %r" % (t, s)) + + res[s] = m(**args) + + assert "wiki" in res + return res + +def makewiki(conf): + res = _makewiki(conf) + + try: + overlaydir = os.environ['MWOVERLAY'] + assert os.path.isdir(overlaydir) + import mwlib.overlay + res['wiki'] = mwlib.overlay.OverlayDB(res['wiki'], overlaydir) + except: + pass + return res diff --git a/mwlib/zipwiki.py b/mwlib/zipwiki.py new file mode 100755 index 0000000..c6893b8 --- /dev/null +++ b/mwlib/zipwiki.py @@ -0,0 +1,167 @@ +#! /usr/bin/env python + +# Copyright (c) 2008, PediaPress GmbH +# See README.txt for additional licensing information. 
+ +import os +import shutil +import simplejson +import tempfile +from zipfile import ZipFile + +from mwlib.metabook import MetaBook +from mwlib import uparser + +class Wiki(object): + def __init__(self, zipfile): + """ + @type zipfile: basestring or ZipFile + """ + + if isinstance(zipfile, ZipFile): + self.zf = zipfile + else: + self.zf = ZipFile(zipfile) + self.metabook = MetaBook() + self.metabook.loadJson(self.zf.read("metabook.json")) + content = simplejson.loads(self.zf.read('content.json')) + self.articles = content['articles'] + self.templates = content['templates'] + + def _getArticle(self, title, revision=None): + try: + article = self.articles[title] + if revision is None or article['revision'] == revision: + return article + except KeyError: + pass + return None + + def getRawArticle(self, title, revision=None): + article = self._getArticle(title, revision=revision) + if article: + return article['content'] + return None + + def getParsedArticle(self, title, revision=None): + raw = self.getRawArticle(title, revision=revision) + if raw is None: + return None + a = uparser.parseString(title=title, raw=raw, wikidb=self) + return a + + def getURL(self, title, revision=None): + article = self._getArticle(title, revision=revision) + if article: + return article['url'] + return None + + def getAuthors(self, title, revision=None): + article = self._getArticle(title, revision=revision) + if article: + return article.get('authors', []) + return None + + def getTemplate(self, name, followRedirects=True): + try: + return self.templates[name]['content'] + except KeyError: + pass + return None + + +class ImageDB(object): + def __init__(self, zipfile, tmpdir=None): + """ + @type zipfile: basestring or ZipFile + """ + + if isinstance(zipfile, ZipFile): + self.zf = zipfile + else: + self.zf = ZipFile(zipfile) + content = simplejson.loads(self.zf.read('content.json')) + self.images = content['images'] + self._tmpdir = tmpdir + self.diskpaths = {} + + @property + def 
tmpdir(self): + if self._tmpdir is None: + self._tmpdir = unicode(tempfile.mkdtemp()) + return self._tmpdir + + def getDiskPath(self, name, size=None): + try: + return self.diskpaths[name] + except KeyError: + pass + try: + data = self.zf.read('images/%s' % name.replace("'", '-').encode('utf-8')) + except KeyError: # no such file + return None + + try: + ext = '.' + name.rsplit('.', 1)[1] + except IndexError: + ext = '' + if ext.lower() == '.svg': + ext = '.svg.png' + elif ext.lower() == '.gif': + ext = '.gif.png' + res = os.path.join(self.tmpdir, 'image%04d%s' % (len(self.diskpaths), ext)) + self.diskpaths[name] = res + f=open(res, "wb") + f.write(data) + f.close() + return res + + def getLicense(self, name): + try: + return self.images[name]['license'] + except KeyError: + return None + + def getPath(self): + raise NotImplemented('getPath() does not work with zipwiki.ImageDB!') + + def getURL(self, name): + try: + return self.images[name]['url'] + except KeyError: + return None + + def clean(self): + if self._tmpdir: + shutil.rmtree(self._tmpdir, ignore_errors=True) + + + + +class FakeImageDB(ImageDB): + + imagedata = '\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03 \x00\x00\x01\xe0\x01\x03\x00\x00\x00g\xc9\x9b\xb6\x00\x00\x00\x01sRGB\x00\xae\xce\x1c\xe9\x00\x00\x00\x06PLTE\xff\xff\xff\x00\x00\x00U\xc2\xd3~\x00\x00\x00\tpHYs\x00\x00\x0b\x13\x00\x00\x0b\x13\x01\x00\x9a\x9c\x18\x00\x00\x00EIDATx\xda\xed\xc1\x01\x01\x00\x00\x00\x82 \xff\xafnH@\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00/\x06\xbd`\x00\x01`<5\x84\x00\x00\x00\x00IEND\xaeB`\x82' + + def __init__(self, tmpdir=None): + """ + @type zipfile: basestring or ZipFile + """ + self._tmpdir = tmpdir + + def getDiskPath(self, name, size=None): + res = os.path.join(self.tmpdir, 'blank.png') + if not os.path.exists(res): + open(res, "w").write(self.imagedata) + return res + + 
def getPath(self): + raise NotImplemented('getPath() does not work with zipwiki.FakeImageDB!') + + def getURL(self, name): + raise NotImplemented('getURL() does not work with zipwiki.FakeImageDB!') + + def getLicense(self, name): + raise NotImplemented('getLicense() does not work with zipwiki.FakeImageDB!') + + + |