#!/usr/bin/perl # Copyright © 2011, 2012, 2013 Petr Písař # This is free software. You may redistribute copies of it under the terms of # the GNU General Public License . # There is NO WARRANTY, to the extent permitted by law. # Changelog: # # Version 9 # - Search for other error message if JSON could not be found. This usually # says the client is not supported (not all URLs serves all formats). # - Improve JSON locator # (http://www.ceskatelevize.cz/ivysilani/1126672097-otazky-vaclava-moravce/). # # Version 8 # - Adapt to changes in SMIL play-list generator on the Czech Television # servers. Effective since 2013-05-24. # - New option `-v' has been added to report retrieved URLs. # # Version 7 # - Print stream data in binaru mode. This fixes premature end-of-file on # DOS-like operating systems. # # Version 6 # - Accept iframe player URL with channel name in the file name # - Send XML-RPC header needed for SMIL generator since 2012-05-09 # # Version 5 # - Support Apple M3U and MPEG2-TS/MPEG-4 format # - Improve HTML parser # - Remove autoStart=false option from iframe player URL to get page with # JSON instead of similar page with iframe again # (http://www.ceskatelevize.cz/porady/10316155327-horizont-ct24/) # # Version 4 # - Support RTMP app with slashes # - Report URI in error messages # # Version 3: # - Output in rtmpdump(1) format if `-d' option is specified # - Do not append playpath to URL if ambigous # # Version 2: # - Output playpath as librtmp option if necessary # (http://www.ct24.cz/vysilani/10099403120-kultura-v-regionech/) # - Perl 5.10 support # - Find JSON via iframe first and fall back to direct JSON # - More general example entry page URL in usage output # - Show content provider error message if exists # use strict; use warnings; our $VERSION = 9; use LWP::UserAgent; use HTTP::Request::Common; use HTTP::Response; use XML::XPath; use URI; use JSON 2.0; use Getopt::Std; use IO::Handle; my $SMIL_GENERATOR = '/ajax/playlistURL.php'; my $SMIL_GENERATOR_NEXT = '/ajax/getSmil.php?url='; my $ENTRY = 'http://www.ceskatelevize.cz/ivysilani/zive/ct24/'; sub usage { return<). If this is the only argument, output list of all available streams in format `STREAM_BITRATE: URL' separated by new line. If STREAM_BITRATE is given, output URL of the stream with given rate only. By default, URL of RTMP/FLV/MPEG-4 stream is printed in librtmp(3) format (space-separated librtmp options may follow the URL and all of them must be passed as one argument to librtmp application). If `-d' option is specified, URL with possible arguments are printed as rtmpdump(1) arguments. Note ampersands are kept literal (this should work in simple subshell substition). If `-A' option is specified, HTTP/MPEG-TS/MPEG-4 video will be retrieved. There exist three levels selected by second option: -l Apple M3U play-list pointing to unbound stream segments will be output. This is handy if you have a player that supports the very special play-list. Specification can be found on . -f The Apple play-list will be processed and URLs of the underlying stream segmented to files will be printed. Due to nature of the play-list, the locators are printed periodically, a new segment a few seconds, possibly in endless loop. Intended workflow is to pipe the URLs to an HTTP client which echoes downloaded segments on standard input of a multimedia player. This is the default level. The bit rate must be specified, if more bit rates are available. -s The stream segments will be retrieved and dumped to standard output as continuous stream. You can pipe it your player. The bit rate must be specified, if more bit rates are available. If `-v' option is specified, additional debugging messages will be printed on standard error output. Version: $VERSION. Copyright © 2011, 2012, 2013 Petr Písař This is free software. You may redistribute copies of it under the terms of the GNU General Public License . There is NO WARRANTY, to the extent permitted by law. EOM } our ($opt_A, $opt_d, $opt_l, $opt_f, $opt_s, $opt_v); getopts('Adlfsv') or die usage; if ($#ARGV < 0 || $#ARGV > 1) { die "Bad invocation\n\n" . usage; } $ENTRY = $ARGV[0]; my $BITRATE = $ARGV[1]; if (!($opt_l || $opt_s)) { $opt_f = 1; } # each that operates on reference to array or hash # Works with perl 5.10.1 too. sub eachref { my $ref = shift; if (ref $ref eq 'HASH') { # Built-in implementation always supports HASH return sub { each %$ref; } } if (eval 'each @$ref' ) { # Built-in Perl 5.12 implementation eval 'return sub { each @$ref; }' } else { # Manual implementation of each ARRAY (needed for Perl < 5.12) my $index = -1; return sub { $index++; if ($index <= $#$ref) { ($index, $$ref[$index]); } else { (); } } } } # Convert nested JSON structure expressed as native hash reference into flat # array of key and value pairs. # E.g. { "x" => [ "y" => "1", "z" => undef ] } # becomes ( "x[0][y]", "1", "x[1][z] => null ). # This is handy when sending nested JSON structure as # application/x-www-form-urlencoded by HTTP::Request::Common. sub flatten { my ($ref, $prefix) = @_; my @output = (); my $doeach = eachref($ref); while (my ($key, $val) = &$doeach) { # TODO: Escape /[[]=]/ my $id = (defined $prefix) ? $prefix . '[' . $key . ']' : $key; if (ref $val eq 'HASH' || ref $val eq 'ARRAY') { push @output, flatten($val, $id); } else { push @output, ($id, $val // 'null'); } } return @output; } # Format RTMP URL for librtmp sub formaturl_librtmp { my ($rtmp, $app, $playpath) = @_; my $stream_url = $rtmp; if ($playpath =~ qr{/} or $app =~ qr{/}) { $stream_url .= ' app=' . $app . ' playpath=' . $playpath; } else { $stream_url .= $app . '/' . $playpath; } } # Format RTMP URL for librtmp sub formaturl_rtmpdump { my ($rtmp, $app, $playpath) = @_; my $stream_url = '--rtmp ' . $rtmp; if ($playpath =~ qr{/} or $app =~ qr{/}) { $stream_url .= ' --app ' . $app . ' --playpath ' . $playpath; } else { $stream_url .= $app . '/' . $playpath; } } # Find first pattern match in HTML page, HTML-unescape it and return it. # Otherwise return undef. sub htmlgrep { my ($html_page, $pattern) = @_; my ($text) = ($html_page =~ $pattern); if (defined $text) { $text =~ s/>/>/g; $text =~ s/</new(shift); s/&/%26/g; return $_; } # Try to get JSON request data from HTML page text. # The page is passed as a first argument. The JSON is located by javascript # function identifier passed as a second argument. # Return the JSON data or undef. # Example: callSOAP({"foo);bar":1}); sub findjson { my ($text, $function) = @_; htmlgrep($text, qr{ # Use possesive quantifiers ++, *+ for performance \Q$function\E \( ( # The JSON structure is a (?: # sequence of quoted strings "(?: [^"\\]++ | \\. )*+" | [^)] # and non-quoted )*+ # non-parentheses. ) # \); }x); } # Try to get setRequestHeader function arguments from JS page text passed as # argument. # Return list (header, value) or undef. sub findrequestheader { local $_ = findjson(shift, 'setRequestHeader'); if (!defined) { return undef; } return (m/'([^']*)', '([^']*)'/); } # Return array of { bitrate => INTEGER, url => URL } found in RTMP SMIL play # list. Arguments is playlist as string, URL of the playlist, # playlist as XML::XPath object and boolean signaling URL format (true for # rtmpdump format, false for librtmp format). sub extract_urls_from_rtmp_smil { my ($smil, $smil_url, $parser, $opt_d) = @_; my $videos = $parser->find('/data/smilRoot/body/switchItem/video[@enabled=true()]'); if ($videos->size <= 0) { die "No videos found in SMIL playlist <" . $smil_url . ">:\n" . $smil . "\n"; } my @bitrate_url_pairs = (); foreach my $video ($videos->get_nodelist) { my $suffix = $video->getAttribute('src'); if (! defined $suffix) { print STDERR q{Missing `video/@src' attribute} . "\n"; next; } my $prefix = $video->getParentNode->getAttribute('base'); if (! defined $suffix) { print STDERR q{Missing `video/../@base' attribute for video } . "`$suffix'\n"; next; } my $bitrate = $video->getAttribute('system-bitrate'); if (! defined $suffix) { print STDERR q{Missing `video/@system-bitrate' attribute for video } . "`$suffix'\n"; next; } # Build stream URL. Because RTMP URL can be ambigous, # applications accept aditional arguments separated by space # (the space must not be URI-encoded). my $stream_url; { my $rtmp = URI->new($prefix); my $app = substr($rtmp->path_query, 1); $rtmp->path('/'); $rtmp->query(undef); my $playpath = URI->new($suffix); if ($opt_d) { $stream_url = formaturl_rtmpdump($rtmp, $app, $playpath); } else { $stream_url = formaturl_librtmp($rtmp, $app, $playpath); } } # Store URL push @bitrate_url_pairs, {'bitrate' => $bitrate, 'url' => $stream_url}; } return @bitrate_url_pairs; } # Print segment URLs or their contnent found in bottom-level Apple M3U # play-list. This function can never return if live stream is served by # a server. # Arguments are URL of the playlist, LWP::UserAgent object, and boolean # signalling content of stream segments should be printed instead of their # URLs. # See . sub iterate_bottom_apple_m3u { my ($m3u_url, $ua, $stream_content) = @_; autoflush STDOUT 1; my $reload = 1; my $last_segment = -1; my $target_duration; while ($reload) { # Get bottom-level Apple MPEG play-list. my $response = $ua->request(GET $m3u_url); $response->is_success or die "Could not get bottom-level Apple M3U play-list from <" . $m3u_url . ">: " . $response->status_line . "\n"; my $retrieved_at = time; my $duration; my $sequence = 0; for (split(/(\r)?\n/, $response->decoded_content)) { if (! defined) { next; } if (/\A#EXT-X-TARGETDURATION:(\d+)/) { $target_duration = $1; next; } if (/\A#EXT-X-MEDIA-SEQUENCE:(\d+)/) { $sequence = $1; next; } if (/\A#EXT-X-ENDLIST\b/) { $reload = 0; next; } if (/\A#EXTINF:(\d+)/) { $duration = $1; next; } if (/\A[^#]/) { if (! defined $duration) { print STDERR 'Stray URL in bottom-level Apple play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; next; } if ($sequence > $last_segment) { my $segment_url = URI->new_abs($_, $m3u_url); if ($stream_content) { # Get segment content. binmode STDOUT; $ua->set_my_handler('response_data', sub { print $_[3]; 1; }, 'm_code' => 2); my $response = $ua->request(GET $segment_url); $response->is_success or die "Could not get stream segment content from <" . $segment_url . ">: " . $response->status_line . "\n"; } else { print $segment_url, "\n"; } $last_segment = $sequence; } $duration = undef; $sequence++; next; } } if (! defined $target_duration || $last_segment == -1) { die 'No target duration or URL found in bottom-level Apple " . "play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; } if ($reload) { my $sleep = $target_duration - (time - $retrieved_at); if ($sleep > 0) { sleep $sleep; } } } } # Return array of { bitrate => INTEGER, url => URL } found in top-level # Apple M3U play-list. Returned URLs are locators of bottom-level Apple M3U # play-list for given bitrate. The play-list specification is on # . # Arguments are URL of the playlist and LWP::UserAgent object. sub extract_urls_from_top_apple_m3u { my ($m3u_url, $ua) = @_; my @bitrate_url_pairs = (); # Get top-level Apple MPEG playlist. my $response = $ua->request(GET $m3u_url); $response->is_success or die "Could not get top-level Apple M3U play-list from <" . $m3u_url . ">: " . $response->status_line . "\n"; my $bitrate; for (split(/(\r)?\n/, $response->decoded_content)) { if (! defined) { next; } if (/\A#EXT-X-STREAM-INF:(?:.*,)?BANDWIDTH=(\d+)/) { $bitrate = $1; next; } if (/\A[^#]/) { if (! defined $bitrate) { print STDERR 'Stray URL in top-level Apple play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; next; } push @bitrate_url_pairs, {'bitrate' => $bitrate, 'url' => URI->new_abs($_, $m3u_url)}; $bitrate = undef; next; } } if ($#bitrate_url_pairs < 0) { die 'No URL found in top-level Apple play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; } return @bitrate_url_pairs; } # Return array of { bitrate => INTEGER, url => URL } found in Apple SMIL play # list. Arguments is playlist as string, URL of the playlist, # playlist as XML::XPath object, and LWP::UserAgent object. sub extract_urls_from_apple_smil { my ($smil, $smil_url, $parser, $ua) = @_; my $videos = $parser->find('/data/smilRoot/body/video'); if ($videos->size <= 0) { die "No videos found in SMIL playlist <" . $smil_url . ">:\n" . $smil . "\n"; } my @bitrate_url_pairs = (); foreach my $video ($videos->get_nodelist) { my $m3u_url = $video->getAttribute('src'); if (! defined $m3u_url) { print STDERR q{Missing `video/@src' attribute} . "\n"; next; } push @bitrate_url_pairs, extract_urls_from_top_apple_m3u($m3u_url, $ua); } return @bitrate_url_pairs; } # Get entry HTML page my $ua = LWP::UserAgent->new; if ($opt_A) { $ua->agent('Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) ' . 'AppleWebKit/531.21.10 (KHTML, like Gecko) ' . 'Version/4.0.4 Mobile/7B314 Safari/531.21.10'); } if ($opt_v) { print STDERR "Getting <$ENTRY>\n"; } my $response = $ua->request(GET $ENTRY); $response->is_success or die "Could not get entry page from <" . $ENTRY . ">: " . $response->status_line . "\n"; my $page = $response->decoded_content; # Try to get iframe player URL # The web page is not well-formed XML, we cannot use XPath # '//html:div[@id="iFramePositionContainer"]/html:iframe/@src' or # '//html:p[@id="iframeHolder"]/html:iframe/@src'; # This is sometimes relative, sometimes absolute path my $iframe_url = htmlgrep($page, qr{src="([^"]*/embed/iFramePlayer(?:[^"]*)\.php[^"]*)"}); if (defined $iframe_url && $iframe_url) { # If it ends with "&autoStart=false", it links to another page with the # same iframe player URL without the parameter. Thus remove the parameter. if ($opt_v) { print STDERR "Iframe URL <$iframe_url> found.\n"; } $iframe_url =~ s/&autoStart=false//; # Get iframe player page $iframe_url = URI->new_abs($iframe_url, $ENTRY); if ($opt_v) { print STDERR "Getting iframe <$iframe_url>\n"; } $response = $ua->request(GET $iframe_url); $response->is_success or die "Could not get iframe player from <" . $iframe_url . ">: " . $response->status_line . "\n"; $page = $response->decoded_content; } # Get JSON request data my $json_data = findjson($page, 'callSOAP'); unless (defined $json_data && $json_data) { # Try to get error message from stream provider my $message = htmlgrep($page, qr{([^<]*)<}); if (defined $message && $message) { die "$message\n"; } $message = htmlgrep($page, qr{(?:<[^>]*>)*([^<]*)<}); if (defined $message && $message) { die "$message\n"; } # else die in general way if ($opt_v) { print STDERR "===BEGIN INPUT===\n$page\n===END INPUT===\n"; } die "Could not find JSON data structure\n"; } # Get XML-RPC header definition my ($xmlrpc_header, $xmlrpc_value) = findrequestheader($page); if (!defined $xmlrpc_header or !defined $xmlrpc_value) { if ($opt_v) { print STDERR "===BEGIN INPUT===\n$page\n===END INPUT===\n"; } print STDERR "XML-RPC header definition not found.\n"; } elsif ($opt_v) { print STDERR "XML-RPC header found: <$xmlrpc_header>\n"; print STDERR "XML-RPC value found: <$xmlrpc_value>\n"; } # Decode JSON request data my $data; eval { $data = decode_json($json_data) } or die "Could not decode JSON string: $json_data: $@\n"; my @data = flatten($data); # Get SMIL playlist URL my $smil_generator_url = URI->new_abs($SMIL_GENERATOR, $ENTRY); $ua->default_header($xmlrpc_header => $xmlrpc_value) if defined $xmlrpc_header; if ($opt_v) { print STDERR "Posting to SMIL URL generator <$smil_generator_url>\n"; print STDERR "Posting data to SMIL URL generator: ", join(',', @data), "\n"; } $response = $ua->request(POST $smil_generator_url, \@data); $response->is_success or die "Could not get SMIL playlist URL from <" . $smil_generator_url . ">: " . $response->status_line . "\n"; my $smil_url = $response->decoded_content; # Rewrite SMIL playlist URL for next-stage generator # Constants are defined in iPlayer.min.js referred from main web page. $smil_url = URI->new_abs($SMIL_GENERATOR_NEXT . $smil_url, $ENTRY); # Get SMIL playlist if ($opt_v) { print STDERR "Getting SMIL playlist <$smil_url>\n"; } $response = $ua->request(GET $smil_url); $ua->default_header($xmlrpc_header => undef) if defined $xmlrpc_header; $response->is_success or die "Could not get SMIL playlist from <" . $smil_url . ">: " . $response->status_line . "\n"; my $smil = $response->decoded_content; # Get stream URLs my $parser = XML::XPath->new('xml' => $smil) or die "Could not parse SMIL playlist from <" . $smil_url . ">:\n" . $smil . "\n"; my @bitrate_url_pairs = (); # XXX: XML::XPath is lazy in parsing. It dies when calling find() on XML syntax # error. eval { if ($opt_A) { @bitrate_url_pairs = extract_urls_from_apple_smil($smil, $smil_url, $parser, $ua); } else { @bitrate_url_pairs = extract_urls_from_rtmp_smil($smil, $smil_url, $parser, $opt_d); } }; if ($@) { print STDERR "Error while finding stream URLs in SMIL playlist from <", $smil_url, ">.\n"; print STDERR "===BEGIN INPUT===\n$smil\n===END INPUT===\n"; die "Reason: $@\n"; } if ($opt_A && ($opt_f || $opt_s)) { # Select Apple M3U URL if (defined $BITRATE) { @bitrate_url_pairs = grep { ${$_}{'bitrate'} == $BITRATE } @bitrate_url_pairs; } if ($#bitrate_url_pairs < 0) { die "No usable video streams found in SMIL playlist:\n$smil\n"; } if ($#bitrate_url_pairs == 0 ) { iterate_bottom_apple_m3u(${$bitrate_url_pairs[0]}{'url'}, $ua, $opt_s); } else { die "Multiple bit-rate play-lists not implemented yet.\n"; } } else { # Output URL my $video_counter = 0; for my $pair (@bitrate_url_pairs) { if (defined $BITRATE) { if (${$pair}{'bitrate'} == $BITRATE) { print "${$pair}{'url'}\n"; $video_counter++; } } else { print "${$pair}{'bitrate'}: ${$pair}{'url'}\n"; $video_counter++; } } if ($video_counter <= 0) { die "No usable video streams found in SMIL playlist:\n$smil\n"; } }