#!/usr/bin/perl # Copyright © 2011, 2012, 2013, 2014, 2015, 2016, 2017 Petr Písař # This is free software. You may redistribute copies of it under the terms of # the GNU General Public License . # There is NO WARRANTY, to the extent permitted by law. # Changelog: # # Version 26 # - Adapt to changes effective since 2017-02-13. # # Version 25 # - Adapt to changes effective since 2016-11-22. # # Version 24 # - Adapt to changes effective since 2015-10-16. # - RTMP mode removed because it's no longer provided by the servers. # - Encode undefined value as empty string to conform latest jQuery # (http://www.ceskatelevize.cz/porady/10316155327-horizont-ct24/215411058051016/). # - If multi-part video is encountered, user has to select one by positional # argument. One can use `-t' option to list the parts. # (http://www.ceskatelevize.cz/ivysilani/11177713200-tour-de-france-2015/215471291092002/). # # Version 23 # - Adapt to changes effective since 2014-12-18. # - Please note that the RTMP streams do not work. # # Version 22 # - Adapt to changes in SMIL URL generator definition as needed since # beginning of December 2014. # - Support ct24 portal links # (http://www.ceskatelevize.cz/ct24/svet/295620-eichler-k-australii-horsi-nez-osamely-vlk-je-zbesily-vlk/) # # Version 21 # - Return iframe searcher as iframes have not yet been eradicted as I thought # (http://www.ceskatelevize.cz/msfotbal/videoarchiv/nejnovejsi/279399-sestrih-utkani-nizozemsko-kostarika/). # - Support Apple M3U playlist URL served as a SMIL play-list. # # Version 20 # - Catch up to major changes on the server side. Not all features are # available yet. # # Version 19 # - Send `type' element not to confuse SMIL playlist genererator as it is # needed since 2014-05-05. # # Version 18 # - Fix locating JSON. The affected RTMP variant got another argument. # # Version 17 # - Fix request for RTMP playlist generator by not sending undefined # streamQuality. # # Version 16 # - Recognize an error message in new format. # - Respect *_proxy environment variables. # # Version 15 # - Implement short path when overloaded server returns SMIL URL directly. # # Version 14 # - Adapt to server changes effective since 2014-02-13. Only RTMP is supported # now. The Apple format is unsupported. # - Require URI::Escape Perl module. # # Version 13 # - Rewrite SMIL playlist URL as performed by web interface since 2014-01-31. # # Version 12 # - Search SMIL playlist generator URL from web pages as the address changes # frequently. # # Version 11 # - Implement change of SMIL playlist generator URL as spotted on 2014-01-10. # Thanks to Jiri B. # - Set output encoding for diganostic messages according to locale as # messages extracted from web pages are in Czech. # # Version 10 # - Implement change of SMIL playlist generator URL as spotted on 2014-01-07. # # Version 9 # - Search for other error message if JSON could not be found. This usually # says the client is not supported (not all URLs serves all formats). # - Improve JSON locator # (http://www.ceskatelevize.cz/ivysilani/1126672097-otazky-vaclava-moravce/). # # Version 8 # - Adapt to changes in SMIL play-list generator on the Czech Television # servers. Effective since 2013-05-24. # - New option `-v' has been added to report retrieved URLs. # # Version 7 # - Print stream data in binaru mode. This fixes premature end-of-file on # DOS-like operating systems. # # Version 6 # - Accept iframe player URL with channel name in the file name # - Send XML-RPC header needed for SMIL generator since 2012-05-09 # # Version 5 # - Support Apple M3U and MPEG2-TS/MPEG-4 format # - Improve HTML parser # - Remove autoStart=false option from iframe player URL to get page with # JSON instead of similar page with iframe again # (http://www.ceskatelevize.cz/porady/10316155327-horizont-ct24/) # # Version 4 # - Support RTMP app with slashes # - Report URI in error messages # # Version 3: # - Output in rtmpdump(1) format if `-d' option is specified # - Do not append playpath to URL if ambigous # # Version 2: # - Output playpath as librtmp option if necessary # - Perl 5.10 support # - Find JSON via iframe first and fall back to direct JSON # - More general example entry page URL in usage output # - Show content provider error message if exists # # Procedure since 2015-10-16 # The hash is at in an in-line # javascript: # # jQuery.extend(Drupal.settings, {[...] # media_ivysilani:{hash:"HEXASTRING"},[...]}); # # Then GET is performed to /ivysilani/embed/iFramePlayerCT24.php?[...] with # hash=HEXASTRING appended to retrieve iframe HTML code. The URL comes from # //span[class="media-ivysilani-placeholder"]/@data-url in the entry point as # computed in # . # # Then XMLHttpRequest POST to /ivysilani/ajax/get-client-playlist with x-addr # HTTP header and playlist[0][type], playlist[0][id], requestUrl, requestSource, # and type input form is performed. # # The x-addr value comes from the PHP page. # The form values comes from # # javascript whose address is in the PHP. # # It returns JSON: # # {"url":"http:\/\/www.ceskatelevize.cz\/ivysilani\/client-playlist\/?key=HEXASTRING2"} # # Then GET is performed to the url value. A JSON is returned: # # { [...], "playlist":[ # {"streamUrls":{"main":"http:\/\/80.188.65.18:80\/cdn\/[...]",[...]} # ] } # # The main streamUrl returns top-level segmented M3U playlist. # # Other code: # # use strict; use warnings; use utf8; use open ':locale'; our $VERSION = 26; use LWP::UserAgent; use HTTP::Request::Common; use HTTP::Response; use XML::XPath; use URI; use URI::Escape; use JSON 2.0; use Getopt::Std; use IO::Handle; my $SMIL_GENERATOR_NEXT = '/ajax/getSmil.php?url='; my $ENTRY = 'http://www.ceskatelevize.cz/ivysilani/zive/ct24/'; sub usage { return<). If this is the only argument and the video has only one part, output list of all available streams in format `BITRATE: URL' separated by new line. If BITRATE is given, output URL of the stream with given rate only. If the video has more parts, you have to specify the part number. The `-A' option must be specified. HTTP/MPEG-TS/MPEG-4 video will be retrieved. There exist four levels selected by option: -t List URLs of top-level Apple M3U playlists for each video part. The output are lines in `PART: URL' format. If the PART was specified, only URL of given video part will be printed. -l Apple M3U play-list pointing to unbound stream segments will be output. This is handy if you have a player that supports the very special play-list. Specification can be found on . -f The Apple play-list will be processed and URLs of the underlying stream segmented to files will be printed. Due to nature of the play-list, the locators are printed periodically, a new segment a few seconds, possibly in endless loop. Intended workflow is to pipe the URLs to an HTTP client which echoes downloaded segments on standard input of a multimedia player. This is the default level. The bit rate must be specified, if more bit rates are available. -s The stream segments will be retrieved and dumped to standard output as continuous stream. You can pipe it your player. The bit rate must be specified, if more bit rates are available. If `-v' option is specified, additional debugging messages will be printed on standard error output. Version: $VERSION. Copyright © 2011, 2012, 2013, 2014, 2015, 2016, 2017 Petr Písař This is free software. You may redistribute copies of it under the terms of the GNU General Public License . There is NO WARRANTY, to the extent permitted by law. EOM } our ($opt_A, $opt_d, $opt_t, $opt_l, $opt_f, $opt_s, $opt_v); getopts('Adtlfsv') or die usage; if ($#ARGV < 0 || $#ARGV > 2) { die "Bad invocation\n\n" . usage; } $ENTRY = shift @ARGV; if (!($opt_l || $opt_s)) { $opt_f = 1; } # each that operates on reference to array or hash # Works with perl 5.10.1 too. sub eachref { my $ref = shift; if (ref $ref eq 'HASH') { # Built-in implementation always supports HASH return sub { each %$ref; } } if (eval 'each @$ref' ) { # Built-in Perl 5.12 implementation eval 'return sub { each @$ref; }' } else { # Manual implementation of each ARRAY (needed for Perl < 5.12) my $index = -1; return sub { $index++; if ($index <= $#$ref) { ($index, $$ref[$index]); } else { (); } } } } # Convert nested JSON structure expressed as native hash reference into flat # array of key and value pairs. # E.g. { "x" => [ "y" => "1", "z" => undef ] } # becomes ( "x[0][y]", "1", "x[1][z] => '' ). # XXX: Since fixing , undefined value is # expressed as empty string instead of 'null' string. # This is handy when sending nested JSON structure as # application/x-www-form-urlencoded by HTTP::Request::Common. sub flatten { my ($ref, $prefix) = @_; my @output = (); my $doeach = eachref($ref); while (my ($key, $val) = &$doeach) { # TODO: Escape /[[]=]/ my $id = (defined $prefix) ? $prefix . '[' . $key . ']' : $key; if (ref $val eq 'HASH' || ref $val eq 'ARRAY') { push @output, flatten($val, $id); } else { push @output, ($id, $val // ''); } } return @output; } # Format RTMP URL for librtmp sub formaturl_librtmp { my ($rtmp, $app, $playpath) = @_; my $stream_url = $rtmp; if ($playpath =~ qr{/} or $app =~ qr{/}) { $stream_url .= ' app=' . $app . ' playpath=' . $playpath; } else { $stream_url .= $app . '/' . $playpath; } } # Format RTMP URL for librtmp sub formaturl_rtmpdump { my ($rtmp, $app, $playpath) = @_; my $stream_url = '--rtmp ' . $rtmp; if ($playpath =~ qr{/} or $app =~ qr{/}) { $stream_url .= ' --app ' . $app . ' --playpath ' . $playpath; } else { $stream_url .= $app . '/' . $playpath; } } # Find first pattern match in HTML page, HTML-unescape it and return it. # Otherwise return undef. sub htmlgrep { my ($html_page, $pattern) = @_; my ($text) = ($html_page =~ $pattern); if (defined $text) { $text =~ s/>/>/g; $text =~ s/</new(shift); s/&/%26/g; return $_; } # Try to get JSON request data from HTML page text. # The page is passed as a first argument. The JSON is located by javascript # function identifier passed as a second argument. # Return the JSON data or undef. # Example: callSOAP({"foo);bar":1}); sub findjson { my ($text, $function) = @_; htmlgrep($text, qr{ # Use possesive quantifiers ++, *+ for performance \Q$function\E \( ( # The JSON structure is a (?: # sequence of quoted strings "(?: [^"\\]++ | \\. )*+" | [^)] # and non-quoted )*+ # non-parentheses. ) # \); }x); } # Try to get setRequestHeader function arguments from JS page text passed as # argument. # Return list (header, value) or undef. sub findrequestheader { local $_ = findjson(shift, 'setRequestHeader'); if (!defined) { return undef; } return (m/'([^']*)', '([^']*)'/); } # Return URL of the only video/@src in the SMIL playlist. # Arguments is playlist as string, URL of the playlist, and # playlist as XML::XPath object. sub extract_url_from_m3u_smil { my ($smil, $smil_url, $parser) = @_; my $videos = $parser->find('/data/smilRoot/body/video'); if ($videos->size <= 0) { die "No videos found in SMIL playlist <" . $smil_url . ">:\n" . $smil . "\n"; } my @bitrate_url_pairs = (); foreach my $video ($videos->get_nodelist) { my $suffix = $video->getAttribute('src'); if (! defined $suffix) { print STDERR q{Missing `video/@src' attribute} . "\n"; next; } # Build stream URL. This is now an HTTP URL. return $suffix; } die "No video URL found.\n"; } # Return array of { bitrate => INTEGER, url => URL } found in RTMP SMIL play # list. Arguments is playlist as string, URL of the playlist, # playlist as XML::XPath object and boolean signaling URL format (true for # rtmpdump format, false for librtmp format). sub extract_urls_from_rtmp_smil { my ($smil, $smil_url, $parser, $opt_d) = @_; my $videos = $parser->find('/data/smilRoot/body/switchItem/video[@enabled=true()]'); if ($videos->size <= 0) { die "No videos found in SMIL playlist <" . $smil_url . ">:\n" . $smil . "\n"; } my @bitrate_url_pairs = (); foreach my $video ($videos->get_nodelist) { my $suffix = $video->getAttribute('src'); if (! defined $suffix) { print STDERR q{Missing `video/@src' attribute} . "\n"; next; } my $prefix = $video->getParentNode->getAttribute('base'); if (! defined $suffix) { print STDERR q{Missing `video/../@base' attribute for video } . "`$suffix'\n"; next; } my $bitrate = $video->getAttribute('system-bitrate'); if (! defined $suffix) { print STDERR q{Missing `video/@system-bitrate' attribute for video } . "`$suffix'\n"; next; } # Build stream URL. Because RTMP URL can be ambigous, # applications accept aditional arguments separated by space # (the space must not be URI-encoded). my $stream_url; { my $rtmp = URI->new($prefix); my $app = substr($rtmp->path_query, 1); $rtmp->path('/'); $rtmp->query(undef); my $playpath = URI->new($suffix); if ($opt_d) { $stream_url = formaturl_rtmpdump($rtmp, $app, $playpath); } else { $stream_url = formaturl_librtmp($rtmp, $app, $playpath); } } ## Build stream URL. This is now an HTTP URL. #my $stream_url = URI->new_abs($suffix, $prefix); # Store URL push @bitrate_url_pairs, {'bitrate' => $bitrate, 'url' => $stream_url}; } return @bitrate_url_pairs; } # Print segment URLs or their content found in bottom-level Apple M3U # play-list. This function can never return if live stream is served by # a server. # Arguments are URL of the playlist, LWP::UserAgent object, and boolean # signalling content of stream segments should be printed instead of their # URLs. # See . sub iterate_bottom_apple_m3u { my ($m3u_url, $ua, $stream_content) = @_; autoflush STDOUT 1; my $reload = 1; my $last_segment = -1; my $target_duration; while ($reload) { # Get bottom-level Apple MPEG play-list. my $response = $ua->request(GET $m3u_url); $response->is_success or die "Could not get bottom-level Apple M3U play-list from <" . $m3u_url . ">: " . $response->status_line . "\n"; my $retrieved_at = time; my $duration; my $sequence = 0; for (split(/(\r)?\n/, $response->decoded_content)) { if (! defined) { next; } if (/\A#EXT-X-TARGETDURATION:(\d+)/) { $target_duration = $1; next; } if (/\A#EXT-X-MEDIA-SEQUENCE:(\d+)/) { $sequence = $1; next; } if (/\A#EXT-X-ENDLIST\b/) { $reload = 0; next; } if (/\A#EXTINF:(\d+)/) { $duration = $1; next; } if (/\A[^#]/) { if (! defined $duration) { print STDERR 'Stray URL in bottom-level Apple play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; next; } if ($sequence > $last_segment) { my $segment_url = URI->new_abs($_, $m3u_url); if ($stream_content) { # Get segment content. binmode STDOUT; $ua->set_my_handler('response_data', sub { print $_[3]; 1; }, 'm_code' => 2); my $response = $ua->request(GET $segment_url); $response->is_success or die "Could not get stream segment content from <" . $segment_url . ">: " . $response->status_line . "\n"; } else { print $segment_url, "\n"; } $last_segment = $sequence; } $duration = undef; $sequence++; next; } } if (! defined $target_duration || $last_segment == -1) { die 'No target duration or URL found in bottom-level Apple " . "play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; } if ($reload) { my $sleep = $target_duration - (time - $retrieved_at); if ($sleep > 0) { sleep $sleep; } } } } # Return array of { bitrate => INTEGER, url => URL } found in top-level # Apple M3U play-list. Returned URLs are locators of bottom-level Apple M3U # play-list for given bitrate. The play-list specification is on # . # Arguments are URL of the playlist and LWP::UserAgent object. sub extract_urls_from_top_apple_m3u { my ($m3u_url, $ua) = @_; my @bitrate_url_pairs = (); # Get top-level Apple MPEG playlist. my $response = $ua->request(GET $m3u_url); $response->is_success or die "Could not get top-level Apple M3U play-list from <" . $m3u_url . ">: " . $response->status_line . "\n"; my $bitrate; for (split(/(\r)?\n/, $response->decoded_content)) { if (! defined) { next; } if (/\A#EXT-X-STREAM-INF:(?:.*,)?BANDWIDTH=(\d+)/) { $bitrate = $1; next; } if (/\A[^#]/) { if (! defined $bitrate) { print STDERR 'Stray URL in top-level Apple play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; next; } push @bitrate_url_pairs, {'bitrate' => $bitrate, 'url' => URI->new_abs($_, $m3u_url)}; $bitrate = undef; next; } } if ($#bitrate_url_pairs < 0) { die 'No URL found in top-level Apple play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; } return @bitrate_url_pairs; } # Return array of { bitrate => INTEGER, url => URL } found in Apple SMIL play # list. Arguments is playlist as string, URL of the playlist, # playlist as XML::XPath object, and LWP::UserAgent object. sub extract_urls_from_apple_smil { my ($smil, $smil_url, $parser, $ua) = @_; my $videos = $parser->find('/data/smilRoot/body/video'); if ($videos->size <= 0) { die "No videos found in SMIL playlist <" . $smil_url . ">:\n" . $smil . "\n"; } my @bitrate_url_pairs = (); foreach my $video ($videos->get_nodelist) { my $m3u_url = $video->getAttribute('src'); if (! defined $m3u_url) { print STDERR q{Missing `video/@src' attribute} . "\n"; next; } push @bitrate_url_pairs, extract_urls_from_top_apple_m3u($m3u_url, $ua); } return @bitrate_url_pairs; } # Return array of { bitrate => INTEGER, url => URL } found in # M3U play-list. Returned URLs are media URLs, bit-rates are dummy numbers. # Arguments are URL of the playlist and LWP::UserAgent object. sub extract_urls_from_m3u { my ($m3u_url, $ua) = @_; my @bitrate_url_pairs = (); # Get the playlist. my $response = $ua->request(GET $m3u_url); $response->is_success or die "Could not M3U play-list from <" . $m3u_url . ">: " . $response->status_line . "\n"; my $order = -1; my $separated; for (split(/(\r)?\n/, $response->decoded_content)) { if (! defined) { next; } if (/\A#EXTINF:/) { $order++; $separated = 1; next; } if (/\A[^#]/) { if (! defined $separated) { print STDERR 'Stray URL in M3U play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; next; } push @bitrate_url_pairs, {'bitrate' => $order, 'url' => URI->new_abs($_, $m3u_url)}; $separated = undef; next; } } if ($#bitrate_url_pairs < 0) { die 'No URL found in M3U play-list from <' . $m3u_url . ">:\n" . $response->decoded_content . "\n"; } return @bitrate_url_pairs; } if (!$opt_A) { die "Non-apple mode not yet supported\n"; } # Get entry HTML page my $ua = LWP::UserAgent->new; $ua->env_proxy(); if ($opt_A) { $ua->agent('Mozilla/5.0(iPad; U; CPU iPhone OS 3_2 like Mac OS X; en-us) ' . 'AppleWebKit/531.21.10 (KHTML, like Gecko) ' . 'Version/4.0.4 Mobile/7B314 Safari/531.21.10'); } else { # ct24 portal does not support default user agent $ua->agent('Mozilla/5.0 (X11; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0'); } if ($opt_v) { print STDERR "Getting <$ENTRY>\n"; } my $response = $ua->request(GET $ENTRY); $response->is_success or die "Could not get entry page from <" . $ENTRY . ">: " . $response->status_line . "\n"; my $page = $response->decoded_content; my $m3u_url; my $smil_url; my $smil_url_is_m3u; # Try to get iframe player # embeds the player within an iframe whose # URL requires a separeted hash code. # embeds the player including the hash # code. # has the player in the main # page directly. # The web page is not well-formed XML, we cannot use XPath # '//span[class="media-ivysilani-placeholder"]/@data-url' or # '//iframe/@src'. my $iframe_url = htmlgrep($page, qr{(?:data-url|src)="([^"]*/embed/iFramePlayer(?:[^"]*)\.php[^"]*)"}); if (defined $iframe_url) { $iframe_url = URI->new_abs($iframe_url, $ENTRY); if ($opt_v) { print STDERR "Iframe player URL <$iframe_url> found.\n"; } # Try to append separeted hash code my $hash_code = htmlgrep($page, qr{\bhash:"([^"]+)"}); if (defined $hash_code and $hash_code ne '') { if ($opt_v) { print STDERR "Hash code $hash_code found.\n"; } $iframe_url->query_form($iframe_url->query_form, 'hash', $hash_code); } else { if ($opt_v) { print STDERR "No separeted hash code found, assuming it's not needed.\n"; } } # Get iframe player page if ($opt_v) { print STDERR "Getting iframe <$iframe_url>\n"; } $response = $ua->request(GET $iframe_url); $response->is_success or die "Could not get iframe player from <" . $iframe_url . ">: " . $response->status_line . "\n"; $page = $response->decoded_content; } else { $iframe_url = $ENTRY; if ($opt_v) { print STDERR "No iframe player found, assuming in-line player.\n"; } } # Get AJAX request data my $ajax_data = findjson($page, 'getPlaylistUrl'); unless (defined $ajax_data && $ajax_data) { # Overloaded server returns SMIL URL directly. Check it here before # printing error message. $smil_url = htmlgrep($page, qr{(?(?:\s*<[^/>]*>)*(.*?)(?:}{\n}g; if ($message) { die "$message\n"; } } # else die in general way if ($opt_v) { print STDERR "===BEGIN INPUT===\n$page\n===END INPUT===\n"; } die "Could not find AJAX data structure\n"; } # The JSON structure is the first argument of getPlaylistUrl() which has # variadic number of arguments. $ajax_data =~ s/(?<=[{\]])(,[^,]*)*\z//; # Build 'data' AJAX structure from AJAX script and iframe sources. # 'requestSource' and 'type' elements are constants defined in the iframe. # 'streamQuality' element is parsed from $ENTRY URL query segment, but it is # not used now. Moreover it can be undefined and then jQuery will not # transport the key with undefined value. # 'addCommercials' element is defined only sometimes in the iframe source and # not needed now. # XXX: Server checks for unexpected keys or values and bails out with 500. Do # not send undefined streamQuality or addCommercials. my $requestUrl = URI->new($iframe_url)->path; my $json_data = qq( { "playlist" : $ajax_data, "requestUrl" : "$requestUrl", "requestSource" : "iVysilani", "type" : "html" } ); #"type" : "flash" # Get XML-RPC header definition my ($xmlrpc_header, $xmlrpc_value) = findrequestheader($page); if (!defined $xmlrpc_header or !defined $xmlrpc_value) { if ($opt_v) { print STDERR "===BEGIN INPUT===\n$page\n===END INPUT===\n"; } print STDERR "XML-RPC header definition not found.\n"; } elsif ($opt_v) { print STDERR "XML-RPC header found: <$xmlrpc_header>\n"; print STDERR "XML-RPC value found: <$xmlrpc_value>\n"; } # Decode JSON request data my $data; eval { $data = decode_json($json_data) } or die "Could not decode JSON string: $json_data: $@\n"; my @data = flatten($data); # Get callSOAP source URL my $script_url = htmlgrep($page, qr{