#!/usr/bin/perl
#
# Convert DMOZ content.rdf.gz data dump into legal RDF
# (and optionally delete Adult content)
#
# $Id: content.perl,v 1.3 2001/07/09 16:42:53 cmdjb Exp $
#
# Copyright 2000 Dave Beckett, ILRT, University of Bristol
# http://purl.org/net/dajobe/
#
# USAGE:
#   gunzip -c content.rdf.gz | perl content.perl > content.rdf
#
# The script is a line-oriented filter: it reads the files named on the
# command line (or standard input), drops the lines belonging to filtered
# topics, repairs the markup of every remaining line and prints it.
#

use strict;
use warnings;

# Set to a false value to pass Adult topics through.
my $delete_adult_content = 1;

# Three values:
#   0 - before first Adult topic
#   1 - during Adult topics
#   2 - afterwards
my $in_adult_content = 0;

# Set this to a topic to just emit that area.
my $keep_topic = undef;

# Three values:
#   0 - before the first topic below $keep_topic
#   1 - inside $keep_topic
#   2 - after leaving $keep_topic
my $in_keep_content = 0;

# Return the topic id declared on $line, or undef when the line does not
# open a topic.
# NOTE(review): the pattern was reconstructed (the markup inside the
# original regex was lost); it assumes topic lines look like
# <Topic r:id="Top/Arts"> - confirm against a real dump.
sub _topic_of {
    my ($line) = @_;
    return $line =~ m{<Topic r:id="([^"]+)">} ? $1 : undef;
}

# Update the Adult / keep-topic state machines from $topic (undef for a
# line that does not open a topic) and return true when the current line
# must be dropped.
#
# The state only changes on topic lines, but the skip decision applies to
# every line, so the whole record of a filtered topic is removed and not
# just its opening line.
sub _skip_line {
    my ($topic) = @_;

    if ($delete_adult_content && defined $topic) {
        if ($in_adult_content == 0) {
            $in_adult_content = 1 if $topic =~ /Adult/;
        }
        elsif ($in_adult_content == 1 && $topic !~ /Adult/) {
            $in_adult_content     = 2;
            $delete_adult_content = 0;   # optimisation to prevent extra match
        }
    }
    return 1 if $in_adult_content == 1;

    # Keep only topics below $keep_topic.  Lines seen while the state is
    # still 0 are passed through.
    if (defined $keep_topic && defined $topic) {
        # \Q...\E: the topic is a literal path prefix, not a pattern.
        if ($in_keep_content == 0 && $topic =~ m{^\Q$keep_topic\E}) {
            $in_keep_content = 1;        # Found first entry in topic
        }
        elsif ($in_keep_content == 1 && $topic !~ m{^\Q$keep_topic\E}) {
            $in_keep_content = 2;        # Inside topic, about to leave
        }
    }
    return 1 if $in_keep_content == 2;   # Skip everything after keep_topic

    return 0;
}

# Return $text with the three XML markup characters replaced by their
# predefined entities.  '&' is handled first so the entities produced for
# '<' and '>' are not escaped again.  Existing entities in $text are not
# recognised and will be escaped a second time.
sub _escape_xml {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

# Return a repaired copy of $line.
sub _fix_line {
    my ($line) = @_;

    # Correct RDF syntax and namespaces
    $line =~ s{ about=}{ r:about=};
    $line =~ s{r:id=}{r:ID=};
    $line =~ s{rdf"}{rdf/"};      # presumably the namespace URI - verify
    # NOTE(review): one further substitution stood here in the original; its
    # pattern began with literal markup and did not survive extraction.
    # Restore it from the upstream script if it is available.
    $line =~ s{"("/?)>}{$1>}g;    # drop a doubled quote before '>' or '/>'

    # Fix broken attribute content - URIs, IDs (first such attribute only)
    $line =~ s{(about|resource|ID)="([^"]+)"}
              { my ($attr, $value) = ($1, $2); $attr . '="' . _escape_xml($value) . '"' }e;

    # Fix broken element content
    # NOTE(review): the start/end tag patterns were reconstructed as "any
    # attribute-less tag"; the original may have named specific elements.
    $line =~ s{(<[\w:]+>)(.+)(</[\w:]+>)}
              { my ($start, $content, $end) = ($1, $2, $3); $start . _escape_xml($content) . $end }ge;

    # Remove/quote chars outside expected range
    # $line =~ s/([\x00-\x08\x0b-\x0c\x0e-\x1f])/sprintf('&#%d;', ord $1)/ge;
    $line =~ s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//g;
    $line =~ s/([\x80-\xff])/sprintf('&#%d;', ord $1)/ge;

    return $line;
}

# Filter the input line by line.
sub main {
    while (my $line = <>) {
        next if _skip_line(_topic_of($line));
        print _fix_line($line);
    }
    return;
}

# Run as a script; stay quiet when loaded with require/do.
main() unless caller;