]> cat aescling's git repositories - mastodon.git/commitdiff
Change how hashtags are normalized (#18795)
authorEugen Rochko <eugen@zeonfederated.com>
Wed, 13 Jul 2022 13:03:28 +0000 (15:03 +0200)
committeraescling <aescling+gitlab@cat.family>
Mon, 5 Sep 2022 04:27:55 +0000 (00:27 -0400)
* Change how hashtags are normalized

* Fix tests

29 files changed:
app/controllers/admin/tags_controller.rb
app/controllers/api/v1/featured_tags_controller.rb
app/controllers/settings/featured_tags_controller.rb
app/javascript/mastodon/actions/compose.js
app/lib/ascii_folding.rb [new file with mode: 0644]
app/lib/hashtag_normalizer.rb [new file with mode: 0644]
app/models/account.rb
app/models/custom_filter.rb
app/models/custom_filter_keyword.rb
app/models/featured_tag.rb
app/models/tag.rb
app/serializers/activitypub/hashtag_serializer.rb
app/serializers/rest/featured_tag_serializer.rb
app/serializers/rest/tag_serializer.rb
app/views/accounts/show.html.haml
app/views/accounts/show.rss.ruby
app/views/admin/tags/show.html.haml
app/views/admin/trends/tags/_tag.html.haml
app/views/admin_mailer/_new_trending_tags.text.erb
app/views/settings/featured_tags/index.html.haml
app/views/tags/_og.html.haml
app/views/tags/show.html.haml
app/views/tags/show.rss.ruby
config/initializers/inflections.rb
db/migrate/20220710102457_add_display_name_to_tags.rb [new file with mode: 0644]
db/schema.rb
spec/lib/hashtag_normalizer_spec.rb [new file with mode: 0644]
spec/models/tag_spec.rb
streaming/index.js

index 749e2f144d3a2bb53e25e578420966cc33278e7b..4f727c398a0be0128e853458f06d052ec1b95257 100644 (file)
@@ -16,6 +16,8 @@ module Admin
       if @tag.update(tag_params.merge(reviewed_at: Time.now.utc))
         redirect_to admin_tag_path(@tag.id), notice: I18n.t('admin.tags.updated_msg')
       else
+        @time_period = (6.days.ago.to_date...Time.now.utc.to_date)
+
         render :show
       end
     end
@@ -27,7 +29,7 @@ module Admin
     end
 
     def tag_params
-      params.require(:tag).permit(:name, :trendable, :usable, :listable)
+      params.require(:tag).permit(:name, :display_name, :trendable, :usable, :listable)
     end
   end
 end
index e4e836c9711dd4ab4fb346990d5611a95d5e5690..c1ead4f5405843baf0d12fea0b6de3aefcd51f6e 100644 (file)
@@ -13,9 +13,7 @@ class Api::V1::FeaturedTagsController < Api::BaseController
   end
 
   def create
-    @featured_tag = current_account.featured_tags.new(featured_tag_params)
-    @featured_tag.reset_data
-    @featured_tag.save!
+    @featured_tag = current_account.featured_tags.create!(featured_tag_params)
     render json: @featured_tag, serializer: REST::FeaturedTagSerializer
   end
 
index e805527d07c63ad3b95ee18b0465a408d587234f..aadff7c835171ac2fec6a6ff09ce4b8182550890 100644 (file)
@@ -11,7 +11,6 @@ class Settings::FeaturedTagsController < Settings::BaseController
 
   def create
     @featured_tag = current_account.featured_tags.new(featured_tag_params)
-    @featured_tag.reset_data
 
     if @featured_tag.save
       redirect_to settings_featured_tags_path
index 878011fc0cd474ae95a4e726eae9cdd2973c95ec..cffd032b387fc2562b678fc3c58092c41f687bc7 100644 (file)
@@ -608,7 +608,20 @@ function insertIntoTagHistory(recognizedTags, text) {
     const state = getState();
     const oldHistory = state.getIn(['compose', 'tagHistory']);
     const me = state.getIn(['meta', 'me']);
-    const names = recognizedTags.map(tag => text.match(new RegExp(`#${tag.name}`, 'i'))[0].slice(1));
+
+    // FIXME: Matching input hashtags with recognized hashtags has become more
+    // complicated because of new normalization rules, it's no longer just
+    // a case sensitivity issue
+    const names = recognizedTags.map(tag => {
+      const matches = text.match(new RegExp(`#${tag.name}`, 'i'));
+
+      if (matches && matches.length > 0) {
+        return matches[0].slice(1);
+      } else {
+        return tag.name;
+      }
+    });
+
     const intersectedOldHistory = oldHistory.filter(name => names.findIndex(newName => newName.toLowerCase() === name.toLowerCase()) === -1);
 
     names.push(...intersectedOldHistory.toJS());
diff --git a/app/lib/ascii_folding.rb b/app/lib/ascii_folding.rb
new file mode 100644 (file)
index 0000000..1798d3d
--- /dev/null
@@ -0,0 +1,10 @@
+# frozen_string_literal: true
+
+class ASCIIFolding
+  NON_ASCII_CHARS        = 'ÀÁÂÃÄÅàáâãäåĀāĂăĄąÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňŉŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž'
+  EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz'
+
+  def fold(str)
+    str.tr(NON_ASCII_CHARS, EQUIVALENT_ASCII_CHARS)
+  end
+end
diff --git a/app/lib/hashtag_normalizer.rb b/app/lib/hashtag_normalizer.rb
new file mode 100644 (file)
index 0000000..c1f99e1
--- /dev/null
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+class HashtagNormalizer
+  def normalize(str)
+    remove_invalid_characters(ascii_folding(lowercase(cjk_width(str))))
+  end
+
+  private
+
+  def remove_invalid_characters(str)
+    str.gsub(/[^[:alnum:]#{Tag::HASHTAG_SEPARATORS}]/, '')
+  end
+
+  def ascii_folding(str)
+    ASCIIFolding.new.fold(str)
+  end
+
+  def lowercase(str)
+    str.mb_chars.downcase.to_s
+  end
+
+  def cjk_width(str)
+    str.unicode_normalize(:nfkc)
+  end
+end
index fb916222371b79b5702de61780eab085c70c8e3f..19fd34ff92157b67676ca917056468d65483ad9a 100644 (file)
@@ -62,7 +62,7 @@ class Account < ApplicationRecord
   )
 
   USERNAME_RE   = /[a-z0-9_]+([a-z0-9_\.-]+[a-z0-9_]+)?/i
-  MENTION_RE    = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:word:]\.\-]+[[:word:]]+)?)/i
+  MENTION_RE    = /(?<=^|[^\/[:word:]])@((#{USERNAME_RE})(?:@[[:alnum:]\.\-]+[[:alnum:]]+)?)/i
   URL_PREFIX_RE = /\Ahttp(s?):\/\/[^\/]+/
 
   include Attachmentable
index e98ed7df9fbab0e2bd8ab79030fa4c53e44e9265..985eab1254cea8db40ea37bd606c01a487ac23a9 100644 (file)
@@ -3,14 +3,14 @@
 #
 # Table name: custom_filters
 #
-#  id         :bigint           not null, primary key
-#  account_id :bigint
+#  id         :bigint(8)        not null, primary key
+#  account_id :bigint(8)
 #  expires_at :datetime
 #  phrase     :text             default(""), not null
 #  context    :string           default([]), not null, is an Array
 #  created_at :datetime         not null
 #  updated_at :datetime         not null
-#  action     :integer          default(0), not null
+#  action     :integer          default("warn"), not null
 #
 
 class CustomFilter < ApplicationRecord
index bf5c5574693f3a48556715885032b56d31d31369..e0d0289ae16efc197b40b052af463a58561f9cbb 100644 (file)
@@ -3,8 +3,8 @@
 #
 # Table name: custom_filter_keywords
 #
-#  id               :bigint           not null, primary key
-#  custom_filter_id :bigint           not null
+#  id               :bigint(8)        not null, primary key
+#  custom_filter_id :bigint(8)        not null
 #  keyword          :text             default(""), not null
 #  whole_word       :boolean          default(TRUE), not null
 #  created_at       :datetime         not null
index 74d62e77781c431fbd56c4014cd4f6764d605141..c9c285bfa1bfec6453285e5dea33f6964e5f0db1 100644 (file)
 #
 
 class FeaturedTag < ApplicationRecord
-  belongs_to :account, inverse_of: :featured_tags, required: true
-  belongs_to :tag, inverse_of: :featured_tags, required: true
+  belongs_to :account, inverse_of: :featured_tags
+  belongs_to :tag, inverse_of: :featured_tags, optional: true # Set after validation
 
-  delegate :name, to: :tag, allow_nil: true
-
-  validates_associated :tag, on: :create
-  validates :name, presence: true, on: :create
+  validate :validate_tag_name, on: :create
   validate :validate_featured_tags_limit, on: :create
 
-  def name=(str)
-    self.tag = Tag.find_or_create_by_names(str.strip)&.first
+  before_create :set_tag
+  before_create :reset_data
+
+  attr_writer :name
+
+  def name
+    tag_id.present? ? tag.name : @name
   end
 
   def increment(timestamp)
@@ -34,14 +36,23 @@ class FeaturedTag < ApplicationRecord
     update(statuses_count: [0, statuses_count - 1].max, last_status_at: account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).where.not(id: deleted_status_id).select(:created_at).first&.created_at)
   end
 
+  private
+
+  def set_tag
+    self.tag = Tag.find_or_create_by_names(@name)&.first
+  end
+
   def reset_data
     self.statuses_count = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).count
     self.last_status_at = account.statuses.where(visibility: %i(public unlisted)).tagged_with(tag).select(:created_at).first&.created_at
   end
 
-  private
-
   def validate_featured_tags_limit
     errors.add(:base, I18n.t('featured_tags.errors.limit')) if account.featured_tags.count >= 10
   end
+
+  def validate_tag_name
+    errors.add(:name, :blank) if @name.blank?
+    errors.add(:name, :invalid) unless @name.match?(/\A(#{Tag::HASHTAG_NAME_RE})\z/i)
+  end
 end
index a6404261495964d3a1dcfc0e0f3c80089c2f9d26..f078007f24f1ed1eaf970e5b1bcfd0a39c587e26 100644 (file)
@@ -15,6 +15,7 @@
 #  last_status_at      :datetime
 #  max_score           :float
 #  max_score_at        :datetime
+#  display_name        :string
 #
 
 class Tag < ApplicationRecord
@@ -24,11 +25,12 @@ class Tag < ApplicationRecord
   has_many :featured_tags, dependent: :destroy, inverse_of: :tag
 
   HASHTAG_SEPARATORS = "_\u00B7\u200c"
-  HASHTAG_NAME_RE    = "([[:word:]_][[:word:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:word:]#{HASHTAG_SEPARATORS}]*[[:word:]_])|([[:word:]_]*[[:alpha:]][[:word:]_]*)"
+  HASHTAG_NAME_RE    = "([[:alnum:]_][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alpha:]#{HASHTAG_SEPARATORS}][[:alnum:]#{HASHTAG_SEPARATORS}]*[[:alnum:]_])|([[:alnum:]_]*[[:alpha:]][[:alnum:]_]*)"
   HASHTAG_RE         = /(?:^|[^\/\)\w])#(#{HASHTAG_NAME_RE})/i
 
   validates :name, presence: true, format: { with: /\A(#{HASHTAG_NAME_RE})\z/i }
   validate :validate_name_change, if: -> { !new_record? && name_changed? }
+  validate :validate_display_name_change, if: -> { !new_record? && display_name_changed? }
 
   scope :reviewed, -> { where.not(reviewed_at: nil) }
   scope :unreviewed, -> { where(reviewed_at: nil) }
@@ -46,6 +48,10 @@ class Tag < ApplicationRecord
     name
   end
 
+  def display_name
+    attributes['display_name'] || name
+  end
+
   def usable
     boolean_with_default('usable', true)
   end
@@ -90,8 +96,10 @@ class Tag < ApplicationRecord
 
   class << self
     def find_or_create_by_names(name_or_names)
-      Array(name_or_names).map(&method(:normalize)).uniq { |str| str.mb_chars.downcase.to_s }.map do |normalized_name|
-        tag = matching_name(normalized_name).first || create(name: normalized_name)
+      names = Array(name_or_names).map { |str| [normalize(str), str] }.uniq(&:first)
+
+      names.map do |(normalized_name, display_name)|
+        tag = matching_name(normalized_name).first || create(name: normalized_name, display_name: display_name)
 
         yield tag if block_given?
 
@@ -129,7 +137,7 @@ class Tag < ApplicationRecord
     end
 
     def normalize(str)
-      str.gsub(/\A#/, '')
+      HashtagNormalizer.new.normalize(str)
     end
   end
 
@@ -138,4 +146,8 @@ class Tag < ApplicationRecord
   def validate_name_change
     errors.add(:name, I18n.t('tags.does_not_match_previous_name')) unless name_was.mb_chars.casecmp(name.mb_chars).zero?
   end
+
+  def validate_display_name_change
+    errors.add(:display_name, I18n.t('tags.does_not_match_previous_name')) unless HashtagNormalizer.new.normalize(display_name).casecmp(name.mb_chars).zero?
+  end
 end
index 1a56e4dfe4c745f375bacb3b6001d7b6d8d95291..90929c57f9dbd05d7146604905a8d23d45af600f 100644 (file)
@@ -10,11 +10,11 @@ class ActivityPub::HashtagSerializer < ActivityPub::Serializer
   end
 
   def name
-    "##{object.name}"
+    "##{object.display_name}"
   end
 
   def href
-    if object.class.name == 'FeaturedTag'
+    if object.instance_of?(FeaturedTag)
       short_account_tag_url(object.account, object.tag)
     else
       tag_url(object)
index 96adcc7d09345a0405adf9eaf5b795f3e858aea7..8abcd9b90fe362e2bdc50c303a80a74e625d75da 100644 (file)
@@ -12,4 +12,8 @@ class REST::FeaturedTagSerializer < ActiveModel::Serializer
   def url
     short_account_tag_url(object.account, object.tag)
   end
+
+  def name
+    object.display_name
+  end
 end
index 74aa571a4c189655154b95ebf07b768e12e965d8..52bfaa4ce4e12bb614cf8579d6e31d4f8494ff5d 100644 (file)
@@ -8,4 +8,8 @@ class REST::TagSerializer < ActiveModel::Serializer
   def url
     tag_url(object)
   end
+
+  def name
+    object.display_name
+  end
 end
index 72e9c661110834ac6874cc1486df3c6904e9ae8a..7fa688bd3552ec4915896768b037f2e48480dc0f 100644 (file)
@@ -75,7 +75,7 @@
           = link_to short_account_tag_path(@account, featured_tag.tag) do
             %h4
               = fa_icon 'hashtag'
-              = featured_tag.name
+              = featured_tag.display_name
               %small
                 - if featured_tag.last_status_at.nil?
                   = t('accounts.nothing_here')
index fd45a8b2b09254ff1f80285280632e41a68e353e..34e29d483f7f222f0fe6cacbb6786d64cf06b020 100644 (file)
@@ -28,7 +28,7 @@ RSS::Builder.build do |doc|
       end
 
       status.tags.each do |tag|
-        item.category(tag.name)
+        item.category(tag.display_name)
       end
     end
   end
index 89e8f2b9ad7a0f38a0dfc6af85030c9606783416..71bce0c0cb20442a62fbca10f9d68c2f5083f5f7 100644 (file)
@@ -1,5 +1,5 @@
 - content_for :page_title do
-  = "##{@tag.name}"
+  = "##{@tag.display_name}"
 
 - if current_user.can?(:view_dashboard)
   - content_for :heading_actions do
@@ -50,7 +50,7 @@
   = render 'shared/error_messages', object: @tag
 
   .fields-group
-    = f.input :name, wrapper: :with_block_label
+    = f.input :display_name, wrapper: :with_block_label
 
   .fields-group
     = f.input :usable, as: :boolean, wrapper: :with_label
index 7bb99b15800956c9fc25ac96f15d7323bc679c65..a30666a08b1fc9cd3820406c4de7b0c55ecccc7d 100644 (file)
@@ -6,7 +6,7 @@
     .pending-account__header
       = link_to admin_tag_path(tag.id) do
         = fa_icon 'hashtag'
-        = tag.name
+        = tag.display_name
 
       %br/
 
index cde5af4e4e677e6bea3cc39bacd0fb86399ca14f..363df369d56743d167f6f1bc0a47fb1b5f199c2d 100644 (file)
@@ -1,12 +1,12 @@
 <%= raw t('admin_mailer.new_trends.new_trending_tags.title') %>
 
 <% @tags.each do |tag| %>
-- #<%= tag.name %>
+- #<%= tag.display_name %>
   <%= raw t('admin.trends.tags.usage_comparison', today: tag.history.get(Time.now.utc).accounts, yesterday: tag.history.get(Time.now.utc - 1.day).accounts) %> • <%= t('admin.trends.tags.current_score', score: Trends.tags.score(tag.id).round(2)) %>
 <% end %>
 
 <% if @lowest_trending_tag %>
-<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %>
+<%= raw t('admin_mailer.new_trends.new_trending_tags.requirements', lowest_tag_name: @lowest_trending_tag.display_name, lowest_tag_score: Trends.tags.score(@lowest_trending_tag.id).round(2), rank: Trends.tags.options[:review_threshold]) %>
 <% else %>
 <%= raw t('admin_mailer.new_trends.new_trending_tags.no_approved_tags') %>
 <% end %>
index 65de7f8f30d1d67f764f9b29de94844ba74ca0c2..5d87e2862d93008bb65c153179657d144ef7b436 100644 (file)
@@ -9,7 +9,7 @@
   = render 'shared/error_messages', object: @featured_tag
 
   .fields-group
-    = f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ')
+    = f.input :name, wrapper: :with_block_label, hint: safe_join([t('simple_form.hints.featured_tag.name'), safe_join(@recently_used_tags.map { |tag| link_to("##{tag.display_name}", settings_featured_tags_path(featured_tag: { name: tag.name }), method: :post) }, ', ')], ' ')
 
   .actions
     = f.button :button, t('featured_tags.add_new'), type: :submit
index a7c289bcb001ac1adc5ba0858e8c1e7871826927..37f644cf2f91a1667e3c86a9911233ee4f186e56 100644 (file)
@@ -1,6 +1,6 @@
 = opengraph 'og:site_name', t('about.hosted_on', domain: site_hostname)
 = opengraph 'og:url', tag_url(@tag)
 = opengraph 'og:type', 'website'
-= opengraph 'og:title', "##{@tag.name}"
-= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.name))
+= opengraph 'og:title', "##{@tag.display_name}"
+= opengraph 'og:description', strip_tags(t('about.about_hashtag_html', hashtag: @tag.display_name))
 = opengraph 'twitter:card', 'summary'
index 0e6d4c43d8241f9edb69acd6b7b8c5bbd8819bc7..608989a2bf4b9995f9a6b0def534a192bcae4ba9 100644 (file)
@@ -1,5 +1,5 @@
 - content_for :page_title do
-  = "##{@tag.name}"
+  = "##{@tag.display_name}"
 
 - content_for :header_tags do
   %meta{ name: 'robots', content: 'noindex' }/
@@ -8,8 +8,8 @@
   = render 'og'
 
 .page-header
-  %h1= "##{@tag.name}"
-  %p= t('about.about_hashtag_html', hashtag: @tag.name)
+  %h1= "##{@tag.display_name}"
+  %p= t('about.about_hashtag_html', hashtag: @tag.display_name)
 
 #mastodon-timeline{ data: { props: Oj.dump(default_props.merge(hashtag: @tag.name, local: @local)) }}
 .notranslate#modal-container
index 9ce71be74cc1b4729c4c64bef527a6be092a7a53..8e0c2327b55c746b6f13e81940bf944da70ceeb3 100644 (file)
@@ -1,6 +1,6 @@
 RSS::Builder.build do |doc|
-  doc.title("##{@tag.name}")
-  doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.name))
+  doc.title("##{@tag.display_name}")
+  doc.description(I18n.t('rss.descriptions.tag', hashtag: @tag.display_name))
   doc.link(tag_url(@tag))
   doc.last_build_date(@statuses.first.created_at) if @statuses.any?
   doc.generator("Mastodon v#{Mastodon::Version.to_s}")
@@ -26,7 +26,7 @@ RSS::Builder.build do |doc|
       end
 
       status.tags.each do |tag|
-        item.category(tag.name)
+        item.category(tag.display_name)
       end
     end
   end
index 9bc9a54b2d5757bcaa7324fdb2d4c623df408aa6..3e5a556176161ec83607359ea5fedc65fe0d7550 100644 (file)
@@ -24,6 +24,7 @@ ActiveSupport::Inflector.inflections(:en) do |inflect|
   inflect.acronym 'RSS'
   inflect.acronym 'REST'
   inflect.acronym 'URL'
+  inflect.acronym 'ASCII'
 
   inflect.singular 'data', 'data'
 end
diff --git a/db/migrate/20220710102457_add_display_name_to_tags.rb b/db/migrate/20220710102457_add_display_name_to_tags.rb
new file mode 100644 (file)
index 0000000..aa78676
--- /dev/null
@@ -0,0 +1,5 @@
+class AddDisplayNameToTags < ActiveRecord::Migration[6.1]
+  def change
+    add_column :tags, :display_name, :string
+  end
+end
index d1edcdcf0be4b65213a93e58d215e4b369f7479f..d138e4f0444b82f75291f6ed9b2bd7fabb797230 100644 (file)
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema.define(version: 2022_07_04_024901) do
+ActiveRecord::Schema.define(version: 2022_07_10_102457) do
 
   # These are extensions that must be enabled in order to support this database
   enable_extension "plpgsql"
@@ -943,6 +943,7 @@ ActiveRecord::Schema.define(version: 2022_07_04_024901) do
     t.datetime "last_status_at"
     t.float "max_score"
     t.datetime "max_score_at"
+    t.string "display_name"
     t.index "lower((name)::text) text_pattern_ops", name: "index_tags_on_name_lower_btree", unique: true
   end
 
diff --git a/spec/lib/hashtag_normalizer_spec.rb b/spec/lib/hashtag_normalizer_spec.rb
new file mode 100644 (file)
index 0000000..fbb9f37
--- /dev/null
@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+require 'rails_helper'
+
+describe HashtagNormalizer do
+  subject { described_class.new }
+
+  describe '#normalize' do
+    it 'converts full-width Latin characters into basic Latin characters' do
+      expect(subject.normalize('Ｓｙｎｔｈｗａｖｅ')).to eq 'synthwave'
+    end
+
+    it 'converts half-width Katakana into Kana characters' do
+      expect(subject.normalize('ｼｰｻｲﾄﾞﾗｲﾅｰ')).to eq 'シーサイドライナー'
+    end
+
+    it 'converts modified Latin characters into basic Latin characters' do
+      expect(subject.normalize('BLÅHAJ')).to eq 'blahaj'
+    end
+
+    it 'strips out invalid characters' do
+      expect(subject.normalize('#foo')).to eq 'foo'
+    end
+
+    it 'keeps valid characters' do
+      expect(subject.normalize('a·b')).to eq 'a·b'
+    end
+  end
+end
index 3949dbce548ee1d9507dc8d74fd8ae0d0546719b..b16f99a79956405a38fbc537ce2bc0858cb2f2cd 100644 (file)
@@ -91,7 +91,7 @@ RSpec.describe Tag, type: :model do
       upcase_string   = 'abcABCabcABCやゆよ'
       downcase_string = 'abcabcabcabcやゆよ';
 
-      tag = Fabricate(:tag, name: downcase_string)
+      tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
       expect(Tag.find_normalized(upcase_string)).to eq tag
     end
   end
@@ -101,12 +101,12 @@ RSpec.describe Tag, type: :model do
       upcase_string   = 'abcABCabcABCやゆよ'
       downcase_string = 'abcabcabcabcやゆよ';
 
-      tag = Fabricate(:tag, name: downcase_string)
+      tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
       expect(Tag.matches_name(upcase_string)).to eq [tag]
     end
 
     it 'uses the LIKE operator' do
-      expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100\\%abc%')]
+      expect(Tag.matches_name('100%abc').to_sql).to eq %q[SELECT "tags".* FROM "tags" WHERE LOWER("tags"."name") LIKE LOWER('100abc%')]
     end
   end
 
@@ -115,7 +115,7 @@ RSpec.describe Tag, type: :model do
       upcase_string   = 'abcABCabcABCやゆよ'
       downcase_string = 'abcabcabcabcやゆよ';
 
-      tag = Fabricate(:tag, name: downcase_string)
+      tag = Fabricate(:tag, name: HashtagNormalizer.new.normalize(downcase_string))
       expect(Tag.matching_name(upcase_string)).to eq [tag]
     end
   end
index 183cdf789988b5dc6f01b4e795315f84a0b103c3..ff7d48250be6530bca9c47066ffa1d6ab088e9cb 100644 (file)
@@ -900,6 +900,34 @@ const startWorker = async (workerId) => {
     return arr;
   };
 
+  /**
+   * See app/lib/ascii_folding.rb for the canonical definitions
+   * of these constants
+   */
+  const NON_ASCII_CHARS        = 'ÀÁÂÃÄÅàáâãäåĀāĂăĄąÇçĆćĈĉĊċČčÐðĎďĐđÈÉÊËèéêëĒēĔĕĖėĘęĚěĜĝĞğĠġĢģĤĥĦħÌÍÎÏìíîïĨĩĪīĬĭĮįİıĴĵĶķĸĹĺĻļĽľĿŀŁłÑñŃńŅņŇňŉŊŋÒÓÔÕÖØòóôõöøŌōŎŏŐőŔŕŖŗŘřŚśŜŝŞşŠšſŢţŤťŦŧÙÚÛÜùúûüŨũŪūŬŭŮůŰűŲųŴŵÝýÿŶŷŸŹźŻżŽž';
+  const EQUIVALENT_ASCII_CHARS = 'AAAAAAaaaaaaAaAaAaCcCcCcCcCcDdDdDdEEEEeeeeEeEeEeEeEeGgGgGgGgHhHhIIIIiiiiIiIiIiIiIiJjKkkLlLlLlLlLlNnNnNnNnnNnOOOOOOooooooOoOoOoRrRrRrSsSsSsSssTtTtTtUUUUuuuuUuUuUuUuUuUuWwYyyYyYZzZzZz';
+
+  /**
+   * @param {string} str
+   * @return {string}
+   */
+  const foldToASCII = str => {
+    const regex = new RegExp(NON_ASCII_CHARS.split('').join('|'), 'g');
+
+    return str.replace(regex, match => {
+      const index = NON_ASCII_CHARS.indexOf(match);
+      return EQUIVALENT_ASCII_CHARS[index];
+    });
+  };
+
+  /**
+   * @param {string} str
+   * @return {string}
+   */
+  const normalizeHashtag = str => {
+    return foldToASCII(str.normalize('NFKC').toLowerCase()).replace(/[^\p{L}\p{N}_\u00b7\u200c]/gu, '');
+  };
+
   /**
    * @param {any} req
    * @param {string} name
@@ -990,7 +1018,7 @@ const startWorker = async (workerId) => {
         reject('No tag for stream provided');
       } else {
         resolve({
-          channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}`],
+          channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}`],
           options: { needsFiltering: true, allowLocalOnly: true },
         });
       }
@@ -1001,7 +1029,7 @@ const startWorker = async (workerId) => {
         reject('No tag for stream provided');
       } else {
         resolve({
-          channelIds: [`timeline:hashtag:${params.tag.toLowerCase()}:local`],
+          channelIds: [`timeline:hashtag:${normalizeHashtag(params.tag)}:local`],
           options: { needsFiltering: true, allowLocalOnly: true },
         });
       }