2025-12-29 17:18:29 -06:00
14 changed files with 149 additions and 314 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -38,10 +38,11 @@ jobs:
      matrix:
        stable: [true]
        crystal:
          - 1.4.1
          - 1.5.1
          - 1.6.2
          - 1.7.3
          - 1.8.2
          - 1.9.2
        include:
          - crystal: nightly
            stable: false
@ -52,7 +53,7 @@ jobs:
          submodules: true
      - name: Install Crystal
-        uses: crystal-lang/install-crystal@v1.8.0
+        uses: crystal-lang/install-crystal@v1.7.0
        with:
          crystal: ${{ matrix.crystal }}
--- a/.github/workflows/container-release.yml
+++ b/.github/workflows/container-release.yml
@ -25,9 +25,9 @@ jobs:
        uses: actions/checkout@v3
      - name: Install Crystal
-        uses: crystal-lang/install-crystal@v1.8.0
+        uses: crystal-lang/install-crystal@v1.6.0
        with:
-          crystal: 1.9.2
+          crystal: 1.5.0
      - name: Run lint
        run: |
@ -77,3 +77,4 @@ jobs:
          tags: quay.io/invidious/invidious:${{ github.sha }}-arm64,quay.io/invidious/invidious:latest-arm64
          build-args: |
            "release=1"
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -14,7 +14,7 @@ jobs:
      with:
        repo-token: ${{ secrets.GITHUB_TOKEN }}
        days-before-stale: 365
-        days-before-pr-stale: 90 
+        days-before-pr-stale: 45 # PRs should be active. Anything that hasn't had activity in more than 45 days should be considered abandoned.
        days-before-close: 30
        exempt-pr-labels: blocked
        stale-issue-message: 'This issue has been automatically marked as stale and will be closed in 30 days because it has not had recent activity and is much likely outdated. If you think this issue is still relevant and applicable, you just have to post a comment and it will be unmarked.'
--- a/config/config.example.yml
+++ b/config/config.example.yml
@ -161,19 +161,6 @@ https_only: false
 #force_resolve:
 ##
 ## Use Innertube's transcripts API instead of timedtext for closed captions
 ##
 ## Useful for larger instances as InnerTube is **not ratelimited**. See https://github.com/iv-org/invidious/issues/2567
 ##
 ## Subtitle experience may differ slightly on Invidious.
 ##
 ## Accepted values: true, false
 ## Default: false
 ##
 # use_innertube_for_captions: false
 # -----------------------------
 #  Logging
 # -----------------------------
--- a/src/invidious/config.cr
+++ b/src/invidious/config.cr
@ -127,9 +127,6 @@ class Config
  # Pool size for HTTP requests to youtube.com and ytimg.com (each domain has a separate pool of `pool_size`)
  property pool_size : Int32 = 100
  # Use Innertube's transcripts API instead of timedtext for closed captions
  property use_innertube_for_captions : Bool = false
  # Saved cookies in "name1=value1; name2=value2..." format
  @[YAML::Field(converter: Preferences::StringToCookies)]
  property cookies : HTTP::Cookies = HTTP::Cookies.new
--- a/src/invidious/frontend/watch_page.cr
+++ b/src/invidious/frontend/watch_page.cr
@ -7,7 +7,7 @@ module Invidious::Frontend::WatchPage
    getter full_videos : Array(Hash(String, JSON::Any))
    getter video_streams : Array(Hash(String, JSON::Any))
    getter audio_streams : Array(Hash(String, JSON::Any))
-    getter captions : Array(Invidious::Videos::Captions::Metadata)
+    getter captions : Array(Invidious::Videos::Caption)
    def initialize(
      @full_videos,
--- a/src/invidious/playlists.cr
+++ b/src/invidious/playlists.cr
@ -89,7 +89,6 @@ struct Playlist
  property views : Int64
  property updated : Time
  property thumbnail : String?
  property subtitle : String?
  def to_json(offset, json : JSON::Builder, video_id : String? = nil)
    json.object do
@ -101,7 +100,6 @@ struct Playlist
      json.field "author", self.author
      json.field "authorId", self.ucid
      json.field "authorUrl", "/channel/#{self.ucid}"
      json.field "subtitle", self.subtitle
      json.field "authorThumbnails" do
        json.array do
@ -358,8 +356,6 @@ def fetch_playlist(plid : String)
  updated = Time.utc
  video_count = 0
  subtitle = extract_text(initial_data.dig?("header", "playlistHeaderRenderer", "subtitle"))
  playlist_info["stats"]?.try &.as_a.each do |stat|
    text = stat["runs"]?.try &.as_a.map(&.["text"].as_s).join("") || stat["simpleText"]?.try &.as_s
    next if !text
@ -401,7 +397,6 @@ def fetch_playlist(plid : String)
    views:            views,
    updated:          updated,
    thumbnail:        thumbnail,
    subtitle:         subtitle,
  })
 end
--- a/src/invidious/routes/api/v1/videos.cr
+++ b/src/invidious/routes/api/v1/videos.cr
@ -87,78 +87,70 @@ module Invidious::Routes::API::V1::Videos
      caption = caption[0]
    end
-    if CONFIG.use_innertube_for_captions
+    url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
      params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
      initial_data = YoutubeAPI.get_transcript(params)
-      webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
+    # Auto-generated captions often have cues that aren't aligned properly with the video,
-    else
+    # as well as some other markup that makes it cumbersome, so we try to fix that here
-      # Timedtext API handling
+    if caption.name.includes? "auto-generated"
-      url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
+      caption_xml = YT_POOL.client &.get(url).body
-      # Auto-generated captions often have cues that aren't aligned properly with the video,
+      if caption_xml.starts_with?("<?xml")
-      # as well as some other markup that makes it cumbersome, so we try to fix that here
+        webvtt = caption.timedtext_to_vtt(caption_xml, tlang)
-      if caption.name.includes? "auto-generated"
+      else
-        caption_xml = YT_POOL.client &.get(url).body
+        caption_xml = XML.parse(caption_xml)
-        if caption_xml.starts_with?("<?xml")
+        webvtt = String.build do |str|
-          webvtt = caption.timedtext_to_vtt(caption_xml, tlang)
+          str << <<-END_VTT
-        else
+          WEBVTT
-          caption_xml = XML.parse(caption_xml)
+          Kind: captions
-
+          Language: #{tlang || caption.language_code}
          webvtt = String.build do |str|
            str << <<-END_VTT
            WEBVTT
            Kind: captions
            Language: #{tlang || caption.language_code}
-            END_VTT
+          END_VTT
-            caption_nodes = caption_xml.xpath_nodes("//transcript/text")
+          caption_nodes = caption_xml.xpath_nodes("//transcript/text")
-            caption_nodes.each_with_index do |node, i|
+          caption_nodes.each_with_index do |node, i|
-              start_time = node["start"].to_f.seconds
+            start_time = node["start"].to_f.seconds
-              duration = node["dur"]?.try &.to_f.seconds
+            duration = node["dur"]?.try &.to_f.seconds
-              duration ||= start_time
+            duration ||= start_time
-              if caption_nodes.size > i + 1
+            if caption_nodes.size > i + 1
-                end_time = caption_nodes[i + 1]["start"].to_f.seconds
+              end_time = caption_nodes[i + 1]["start"].to_f.seconds
-              else
+            else
-                end_time = start_time + duration
+              end_time = start_time + duration
              end
              start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}"
              end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}"
              text = HTML.unescape(node.content)
              text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "")
              text = text.gsub(/<\/font>/, "")
              if md = text.match(/(?<name>.*) : (?<text>.*)/)
                text = "<v #{md["name"]}>#{md["text"]}</v>"
              end
              str << <<-END_CUE
              #{start_time} --> #{end_time}
              #{text}
              END_CUE
            end
            start_time = "#{start_time.hours.to_s.rjust(2, '0')}:#{start_time.minutes.to_s.rjust(2, '0')}:#{start_time.seconds.to_s.rjust(2, '0')}.#{start_time.milliseconds.to_s.rjust(3, '0')}"
            end_time = "#{end_time.hours.to_s.rjust(2, '0')}:#{end_time.minutes.to_s.rjust(2, '0')}:#{end_time.seconds.to_s.rjust(2, '0')}.#{end_time.milliseconds.to_s.rjust(3, '0')}"
            text = HTML.unescape(node.content)
            text = text.gsub(/<font color="#[a-fA-F0-9]{6}">/, "")
            text = text.gsub(/<\/font>/, "")
            if md = text.match(/(?<name>.*) : (?<text>.*)/)
              text = "<v #{md["name"]}>#{md["text"]}</v>"
            end
            str << <<-END_CUE
            #{start_time} --> #{end_time}
            #{text}
            END_CUE
          end
        end
      end
    else
      # Some captions have "align:[start/end]" and "position:[num]%"
      # attributes. Those are causing issues with VideoJS, which is unable
      # to properly align the captions on the video, so we remove them.
      #
      # See: https://github.com/iv-org/invidious/issues/2391
      webvtt = YT_POOL.client &.get("#{url}&format=vtt").body
      if webvtt.starts_with?("<?xml")
        webvtt = caption.timedtext_to_vtt(webvtt)
      else
        # Some captions have "align:[start/end]" and "position:[num]%"
        # attributes. Those are causing issues with VideoJS, which is unable
        # to properly align the captions on the video, so we remove them.
        #
        # See: https://github.com/iv-org/invidious/issues/2391
        webvtt = YT_POOL.client &.get("#{url}&format=vtt").body
-        if webvtt.starts_with?("<?xml")
+          .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1")
          webvtt = caption.timedtext_to_vtt(webvtt)
        else
          webvtt = YT_POOL.client &.get("#{url}&format=vtt").body
            .gsub(/([0-9:.]{12} --> [0-9:.]{12}).+/, "\\1")
        end
      end
    end
--- a/src/invidious/videos.cr
+++ b/src/invidious/videos.cr
@ -24,7 +24,7 @@ struct Video
  property updated : Time
  @[DB::Field(ignore: true)]
-  @captions = [] of Invidious::Videos::Captions::Metadata
+  @captions = [] of Invidious::Videos::Caption
  @[DB::Field(ignore: true)]
  property adaptive_fmts : Array(Hash(String, JSON::Any))?
@ -215,9 +215,9 @@ struct Video
    keywords.includes? "YouTube Red"
  end
-  def captions : Array(Invidious::Videos::Captions::Metadata)
+  def captions : Array(Invidious::Videos::Caption)
    if @captions.empty? && @info.has_key?("captions")
-      @captions = Invidious::Videos::Captions::Metadata.from_yt_json(info["captions"])
+      @captions = Invidious::Videos::Caption.from_yt_json(info["captions"])
    end
    return @captions
--- a/src/invidious/videos/caption.cr
+++ b/src/invidious/videos/caption.cr
@ -1,106 +1,100 @@
 require "json"
 module Invidious::Videos
-  module Captions
+  struct Caption
-    struct Metadata
+    property name : String
-      property name : String
+    property language_code : String
-      property language_code : String
+    property base_url : String
      property base_url : String
-      property auto_generated : Bool
+    def initialize(@name, @language_code, @base_url)
    end
-      def initialize(@name, @language_code, @base_url, @auto_generated)
+    # Parse the JSON structure from Youtube
    def self.from_yt_json(container : JSON::Any) : Array(Caption)
      caption_tracks = container
        .dig?("playerCaptionsTracklistRenderer", "captionTracks")
        .try &.as_a
      captions_list = [] of Caption
      return captions_list if caption_tracks.nil?
      caption_tracks.each do |caption|
        name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"]
        name = name.to_s.split(" - ")[0]
        language_code = caption["languageCode"].to_s
        base_url = caption["baseUrl"].to_s
        captions_list << Caption.new(name, language_code, base_url)
      end
-      # Parse the JSON structure from Youtube
+      return captions_list
-      def self.from_yt_json(container : JSON::Any) : Array(Captions::Metadata)
+    end
        caption_tracks = container
          .dig?("playerCaptionsTracklistRenderer", "captionTracks")
          .try &.as_a
-        captions_list = [] of Captions::Metadata
+    def timedtext_to_vtt(timedtext : String, tlang = nil) : String
-        return captions_list if caption_tracks.nil?
+      # In the future, we could just directly work with the url. This is more of a POC
      cues = [] of XML::Node
      tree = XML.parse(timedtext)
      tree = tree.children.first
-        caption_tracks.each do |caption|
+      tree.children.each do |item|
-          name = caption["name"]["simpleText"]? || caption["name"]["runs"][0]["text"]
+        if item.name == "body"
-          name = name.to_s.split(" - ")[0]
+          item.children.each do |cue|
-
+            if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n")
-          language_code = caption["languageCode"].to_s
+              cues << cue
          base_url = caption["baseUrl"].to_s
          auto_generated = (caption["kind"]? == "asr")
          captions_list << Captions::Metadata.new(name, language_code, base_url, auto_generated)
        end
        return captions_list
      end
      def timedtext_to_vtt(timedtext : String, tlang = nil) : String
        # In the future, we could just directly work with the url. This is more of a POC
        cues = [] of XML::Node
        tree = XML.parse(timedtext)
        tree = tree.children.first
        tree.children.each do |item|
          if item.name == "body"
            item.children.each do |cue|
              if cue.name == "p" && !(cue.children.size == 1 && cue.children[0].content == "\n")
                cues << cue
              end
            end
            break
          end
          break
        end
        result = String.build do |result|
          result << <<-END_VTT
          WEBVTT
          Kind: captions
          Language: #{tlang || @language_code}
          END_VTT
          result << "\n\n"
          cues.each_with_index do |node, i|
            start_time = node["t"].to_f.milliseconds
            duration = node["d"]?.try &.to_f.milliseconds
            duration ||= start_time
            if cues.size > i + 1
              end_time = cues[i + 1]["t"].to_f.milliseconds
            else
              end_time = start_time + duration
            end
            # start_time
            result << start_time.hours.to_s.rjust(2, '0')
            result << ':' << start_time.minutes.to_s.rjust(2, '0')
            result << ':' << start_time.seconds.to_s.rjust(2, '0')
            result << '.' << start_time.milliseconds.to_s.rjust(3, '0')
            result << " --> "
            # end_time
            result << end_time.hours.to_s.rjust(2, '0')
            result << ':' << end_time.minutes.to_s.rjust(2, '0')
            result << ':' << end_time.seconds.to_s.rjust(2, '0')
            result << '.' << end_time.milliseconds.to_s.rjust(3, '0')
            result << "\n"
            node.children.each do |s|
              result << s.content
            end
            result << "\n"
            result << "\n"
          end
        end
        return result
      end
      result = String.build do |result|
        result << <<-END_VTT
        WEBVTT
        Kind: captions
        Language: #{tlang || @language_code}
        END_VTT
        result << "\n\n"
        cues.each_with_index do |node, i|
          start_time = node["t"].to_f.milliseconds
          duration = node["d"]?.try &.to_f.milliseconds
          duration ||= start_time
          if cues.size > i + 1
            end_time = cues[i + 1]["t"].to_f.milliseconds
          else
            end_time = start_time + duration
          end
          # start_time
          result << start_time.hours.to_s.rjust(2, '0')
          result << ':' << start_time.minutes.to_s.rjust(2, '0')
          result << ':' << start_time.seconds.to_s.rjust(2, '0')
          result << '.' << start_time.milliseconds.to_s.rjust(3, '0')
          result << " --> "
          # end_time
          result << end_time.hours.to_s.rjust(2, '0')
          result << ':' << end_time.minutes.to_s.rjust(2, '0')
          result << ':' << end_time.seconds.to_s.rjust(2, '0')
          result << '.' << end_time.milliseconds.to_s.rjust(3, '0')
          result << "\n"
          node.children.each do |s|
            result << s.content
          end
          result << "\n"
          result << "\n"
        end
      end
      return result
    end
    # List of all caption languages available on Youtube.
--- a/src/invidious/videos/transcript.cr
+++ b/src/invidious/videos/transcript.cr
@ -1,103 +0,0 @@
 module Invidious::Videos
  # Namespace for methods primarily relating to Transcripts
  module Transcript
    record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
    def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
      kind = auto_generated ? "asr" : ""
      object = {
        "1:0:string" => video_id,
        "2:base64" => {
          "1:string" => kind,
          "2:string" => language_code,
          "3:string" => "",
        },
        "3:varint" => 1_i64,
        "5:string" => "engagement-panel-searchable-transcript-search-panel",
        "6:varint" => 1_i64,
        "7:varint" => 1_i64,
        "8:varint" => 1_i64,
      }
      params = object.try { |i| Protodec::Any.cast_json(i) }
        .try { |i| Protodec::Any.from_json(i) }
        .try { |i| Base64.urlsafe_encode(i) }
        .try { |i| URI.encode_www_form(i) }
      return params
    end
    def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
      # Convert into array of TranscriptLine
      lines = self.parse(initial_data)
      # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
      vtt = String.build do |vtt|
        vtt << <<-END_VTT
        WEBVTT
        Kind: captions
        Language: #{target_language}
        END_VTT
        vtt << "\n\n"
        lines.each do |line|
          start_time = line.start_ms
          end_time = line.end_ms
          # start_time
          vtt << start_time.hours.to_s.rjust(2, '0')
          vtt << ':' << start_time.minutes.to_s.rjust(2, '0')
          vtt << ':' << start_time.seconds.to_s.rjust(2, '0')
          vtt << '.' << start_time.milliseconds.to_s.rjust(3, '0')
          vtt << " --> "
          # end_time
          vtt << end_time.hours.to_s.rjust(2, '0')
          vtt << ':' << end_time.minutes.to_s.rjust(2, '0')
          vtt << ':' << end_time.seconds.to_s.rjust(2, '0')
          vtt << '.' << end_time.milliseconds.to_s.rjust(3, '0')
          vtt << "\n"
          vtt << line.line
          vtt << "\n"
          vtt << "\n"
        end
      end
      return vtt
    end
    private def self.parse(initial_data : Hash(String, JSON::Any))
      body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
        "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
        "initialSegments").as_a
      lines = [] of TranscriptLine
      body.each do |line|
        # Transcript section headers. They are not apart of the captions and as such we can safely skip them.
        if line.as_h.has_key?("transcriptSectionHeaderRenderer")
          next
        end
        line = line["transcriptSegmentRenderer"]
        start_ms = line["startMs"].as_s.to_i.millisecond
        end_ms = line["endMs"].as_s.to_i.millisecond
        text = extract_text(line["snippet"]) || ""
        lines << TranscriptLine.new(start_ms, end_ms, text)
      end
      return lines
    end
  end
 end
--- a/src/invidious/views/playlist.ecr
+++ b/src/invidious/views/playlist.ecr
@ -70,12 +70,7 @@
            </b>
        <% else %>
            <b>
-                <% if !author.empty? %>
+                <a href="/channel/<%= playlist.ucid %>"><%= author %></a> |
                    <a href="/channel/<%= playlist.ucid %>"><%= author %></a> |
                <% elsif !playlist.subtitle.nil? %>
                    <% subtitle = playlist.subtitle || "" %>
                    <span><%= HTML.escape(subtitle[0..subtitle.rindex(" • ") || subtitle.size]) %></span> |
                <% end %>
                <%= translate_count(locale, "generic_videos_count", playlist.video_count) %> |
                <%= translate(locale, "Updated `x` ago", recode_date(playlist.updated, locale)) %>
            </b>
--- a/src/invidious/views/user/preferences.ecr
+++ b/src/invidious/views/user/preferences.ecr
@ -89,7 +89,7 @@
                <label for="captions[0]"><%= translate(locale, "preferences_captions_label") %></label>
                <% preferences.captions.each_with_index do |caption, index| %>
                    <select class="pure-u-1-6" name="captions[<%= index %>]" id="captions[<%= index %>]">
-                        <% Invidious::Videos::Captions::LANGUAGES.each do |option| %>
+                        <% Invidious::Videos::Caption::LANGUAGES.each do |option| %>
                            <option value="<%= option %>" <% if preferences.captions[index] == option %> selected <% end %>><%= translate(locale, option.blank? ? "none" : option) %></option>
                        <% end %>
                    </select>
--- a/src/invidious/yt_backend/youtube_api.cr
+++ b/src/invidious/yt_backend/youtube_api.cr
@ -557,30 +557,6 @@ module YoutubeAPI
    return self._post_json("/youtubei/v1/search", data, client_config)
  end
  ####################################################################
  # get_transcript(params, client_config?)
  #
  # Requests the youtubei/v1/get_transcript endpoint with the required headers
  # and POST data in order to get a JSON reply.
  #
  # The requested data is a specially encoded protobuf string that denotes the specific language requested.
  #
  # An optional ClientConfig parameter can be passed, too (see
  # `struct ClientConfig` above for more details).
  #
  def get_transcript(
    params : String,
    client_config : ClientConfig | Nil = nil
  ) : Hash(String, JSON::Any)
    data = {
      "context" => self.make_context(client_config),
      "params"  => params,
    }
    return self._post_json("/youtubei/v1/get_transcript", data, client_config)
  end
  ####################################################################
  # _post_json(endpoint, data, client_config?)
  #