use strict;
use warnings;

use File::Find;
use Data::Dumper;

## -----------------------------------------------------------------------------
## html2db.pl - rebuild Movable Type (MT) data from a directory of published
## HTML pages.
##
## Usage: html2db.pl dirname
##
## Outputs, both written to the current directory:
##   import.txt - entries, comments and trackbacks as an MT import file
##                (load it with MT's import feature)
##   tags.txt   - Data::Dumper dump of the recovered { title, tag } pairs
##                (use it as the basis for entering tags by hand)
## -----------------------------------------------------------------------------

die "usage: html2db.pl dirname" unless ( $ARGV[0] );

## Upper bound on tags / comments / trackbacks recovered from a single page.
my $MAX_RECOVERED = 100;

my %datahash;        # "title:date" keys of entries that were already exported
my $firstflg = 0;    # becomes true once the first entry has been written
my @tags;            # recovered tags, one { title => ..., tag => ... } each

## Write the MT import file.
open my $oh, '>', 'import.txt' or die "import.txt: $!";
find( { wanted => \&filter, no_chdir => 1 }, $ARGV[0] );
close $oh or die "import.txt: $!";    # buffered write errors surface here

## Write the tag data to a separate file.
open $oh, '>', 'tags.txt' or die "tags.txt: $!";
print {$oh} Dumper( [@tags] );
close $oh or die "tags.txt: $!";

## -----------------------------------------------------------------------------
## File::Find callback: hands every path ending in ".html" to parser().
## A failure while parsing one file is reported with warn() and does not stop
## the directory walk.
## -----------------------------------------------------------------------------
sub filter {
    my $name = $File::Find::name;
    return if ( $name !~ /^.+\.html$/ );

    eval { parser($name); 1 } or warn "$name : $@ \n";
    return;
}

## -----------------------------------------------------------------------------
## Builds an MT import date, "MM/DD/20YY hh:mm:00 AM|PM", from a two-digit
## year and a 24-hour clock time.
##
## Arguments: month, day, two-digit year, hour (0-23), minute.
## Hour 0 is rendered as 12 AM and hour 12 as 12 PM.
## -----------------------------------------------------------------------------
sub _mt_date {
    my ( $month, $day, $yy, $hour, $minute ) = @_;

    my $meridiem = $hour >= 12 ? 'PM' : 'AM';
    my $hour12   = $hour % 12;
    $hour12 = 12 if ( $hour12 == 0 );

    return sprintf( '%s/%s/20%s %02d:%02d:00 %s',
        $month, $day, $yy, $hour12, $minute, $meridiem );
}

## -----------------------------------------------------------------------------
## HTML parser: extracts one entry (title, body, category, author, date, tags,
## comments, trackbacks) from the page $filename and appends it to import.txt.
## Recovered tags are pushed onto the file-level @tags.
##
## The regular expressions are tied to one specific page template and must be
## adapted to your own template.
## NOTE(review): several patterns below look as if their HTML tag anchors were
## lost (e.g. the body pattern starts directly with \n) - verify them against
## the real template before relying on the output.
##
## Dies if the file cannot be opened; filter() turns that into a warning.
## -----------------------------------------------------------------------------
sub parser {
    my $filename = shift;
    warn $filename;    # progress trace on STDERR

    my $MTEntryTitle  = '';
    my $MTCategory    = '';
    my $MTEntryBody   = '';    # the "more" part of an entry ends up in here too
    my $MTEntryDate   = '';
    my $MTEntryAuthor = '';
    my @trackbacks;
    my @comments;

    open my $fh, '<', $filename or die $!;
    my $html = do { local $/; <$fh> };
    close $fh;
    $html = '' unless ( defined $html );

    ## Title: extracted from the dc:title attribute (RDF data in the page).
    if ( $html =~ m!dc:title="(.+?)"!ms ) {
        $MTEntryTitle = $1;
    }

    ## Body.
    if ( $html =~ m!\n\s*(.+?)\n\s*[\r\n\s]+?!ms ) {
        $MTEntryBody = $1;
    }

    ## Category.
    ## NOTE(review): author, date and tags are only looked for when the
    ## category pattern matched. That nesting is kept from the original code;
    ## confirm that it is intended.
    if ( $html =~ m!class="entry-navi-header">Categories:.+?(.+?)!ms ) {
        $MTCategory = $1;

        ## Author and entry date. When this does not match, the date falls
        ## back to a fixed default further down.
        if ( my ( $author, $yy, $mm, $dd, $hour, $minute ) =
            $html =~ m!class="entry-footer">.+?class="post-footers">\s*([^\|]+?)\s*\|.+?(\d\d)/(\d\d)/(\d\d).+?(\d+):(\d+)!ms )
        {
            $MTEntryDate   = _mt_date( $mm, $dd, $yy, $hour, $minute );
            $MTEntryAuthor = $author;
        }

        ## Tags, at most $MAX_RECOVERED per page.
        pos($html) = undef;    # start every m//g scan at the top of the page
        for ( 1 .. $MAX_RECOVERED ) {
            last unless ( $html =~ m!rel="tag">(.+?)!msg );
            push @tags, { title => $MTEntryTitle, tag => $1 };
        }
    }

    ## Comments, at most $MAX_RECOVERED per page. Only the name, the posting
    ## date and the text are recovered.
    pos($html) = undef;
    for ( 1 .. $MAX_RECOVERED ) {
        last
            unless ( $html =~ m!class="comment"\sid="comment-(\d+)">.+?class="comment-content">\n\s*(.+?)\n\s*[\r\n\s]+?\s*([^\|]+?)\s*\|.+?(\d\d)/(\d+)/(\d+).+?(\d\d):(\d\d)!msg );

        ## Copy the captures right away, before anything can clobber them.
        my ( $id, $body, $author, $yy, $mm, $dd, $hour, $minute ) =
            ( $1, $2, $3, $4, $5, $6, $7, $8 );
        push @comments,
            {
            id     => $id,
            body   => $body,
            author => $author,
            date   => _mt_date( $mm, $dd, $yy, $hour, $minute ),
            };
    }

    ## Trackbacks, at most $MAX_RECOVERED per page. The date is a fixed
    ## placeholder.
    ## NOTE(review): the pattern has a single capture group, so the title is
    ## always empty with the pattern as it stands.
    pos($html) = undef;
    for ( 1 .. $MAX_RECOVERED ) {
        last unless ( $html =~ m!class="trackback-list-item">(.+?)!msg );
        my ( $url, $title ) = ( $1, $2 );
        push @trackbacks,
            {
            url   => $url,
            title => defined $title ? $title : '',
            date  => qq{10/07/2007 07:00:00 PM},
            };
    }
    pos($html) = undef;

    ## COMMENT sections of the import record, one field per line.
    my $commstr = '';
    for my $comment (@comments) {
        $commstr .= join "\n",
            '-----',
            'COMMENT:',
            "AUTHOR: $comment->{author}",
            'EMAIL:',
            'IP: 127.0.0.1',
            'URL:',
            "DATE: $comment->{date}",
            $comment->{body},
            '';
    }
    $commstr = "-----\nCOMMENT:\n" if ( $commstr eq '' );

    ## PING sections of the import record. Blog name and excerpt cannot be
    ## recovered, so placeholders are written.
    my $pingstr = '';
    for my $ping (@trackbacks) {
        $pingstr .= join "\n",
            '-----',
            'PING:',
            "TITLE: $ping->{title}",
            "URL: $ping->{url}",
            'IP: 127.0.0.1',
            'BLOG NAME: ... orz',
            "DATE: $ping->{date}",
            'oh! no!! ... orz',
            '';
    }
    $pingstr = "-----\nPING:\n" if ( $pingstr eq '' );

    ## Defaults for values that could not be recovered.
    $MTEntryDate = qq{01/01/2007 0:00:00 AM} if ( $MTEntryDate eq '' );
    $MTEntryBody = qq{... orz}               if ( $MTEntryBody eq '' );

    ## An entry with the same title and date as an earlier one is skipped.
    return if ( $datahash{"$MTEntryTitle:$MTEntryDate"}++ );

    ## Write the entry; entries after the first are preceded by a separator.
    print {$oh} qq{\n--------\n} if ($firstflg);
    $firstflg = 1;

    print {$oh} join "\n",
        "AUTHOR: $MTEntryAuthor",
        "TITLE: $MTEntryTitle",
        'STATUS: Publish',
        'ALLOW COMMENTS: 1',
        'CONVERT BREAKS: __default__',
        'ALLOW PINGS: 1',
        "PRIMARY CATEGORY: $MTCategory",
        "CATEGORY: $MTCategory",
        "DATE: $MTEntryDate",
        '-----',
        'BODY:',
        $MTEntryBody,
        '-----',
        'EXTENDED BODY:',
        '-----',
        'EXCERPT:',
        '-----',
        'KEYWORDS:',
        '';
    print {$oh} $commstr, $pingstr, "-----\n";

    return;
}

1;
__END__