#include "ad_match.h"
#include <maps/renderer/denormalization/lib/tasks/impl/ad_common/ad_extra/city_match.h>
#include <maps/renderer/denormalization/lib/tasks/impl/ad_common/ad_extra/names.h>
#include <maps/renderer/denormalization/lib/base/include/geometry.h>
#include <maps/libs/csv/include/output_stream.h>
#include <maps/libs/json/include/value.h>
#include <maps/libs/log8/include/log8.h>
#include <library/cpp/resource/resource.h>
#include <algorithm>
#include <fstream>
#include <set>
#include <string>
#include <unordered_map>
namespace maps::renderer::denormalization {
namespace {
/// Reads `field` from the JSON object `value` and converts it to `T`.
/// When the object has no such field, `defaultValue` is returned instead
/// (a value-initialized `T` unless the caller supplies one).
template<class T>
T getOrDefault(const json::Value& value, const std::string& field, const T& defaultValue = T())
{
    if (!value.hasField(field)) {
        return defaultValue;
    }
    return value[field].as<T>();
}
// Loads the reference city list from the embedded "wiki_cities.json" resource
// and converts every entry into a CityDescription.
// "id", "isocode" and "location" are read unconditionally; all other fields
// are optional and read through getOrDefault.
std::vector<CityDescription> loadWikiCities()
{
auto json = json::Value::fromString(NResource::Find("wiki_cities.json"));
std::vector<CityDescription> result;
for (const auto& city: json) {
result.push_back(CityDescription{
// The first character of the id string is skipped and the rest is parsed
// as an unsigned integer.
std::stoull(city["id"].as<std::string>().substr(1)), // Q prefix
city["isocode"].as<std::string>(),
// "location" is parsed as WKT and transformed to Mercator.
// NOTE(review): the coordinate is presumably copied out of temporary geometry
// objects here, so keep this as a single expression — confirm before refactoring.
*geom::transformToMercator(*geom::readWktGeometry(city["location"].as<std::string>()))
->getCoordinate(),
// A missing "area" is treated as 0; presumably the source value is in km^2.
getOrDefault<double>(city, "area", 0.0) * 1000 * 1000, // to m^2
// Missing labels become empty strings.
Names{
getOrDefault<std::string>(city, "label_en"),
getOrDefault<std::string>(city, "label_ru"),
getOrDefault<std::string>(city, "label_lo"),
},
AdCityStats{
getOrDefault<size_t>(city, "population", 0),
// The metro population falls back to the plain population (or 0).
getOrDefault<size_t>(
city, "population_metro", getOrDefault<size_t>(city, "population", 0)),
getOrDefault<size_t>(city, "visits", 0),
getOrDefault<size_t>(city, "quality", 0),
},
});
}
return result;
}
size_t adPopulationQuality(size_t populationValue)
{
static std::unordered_map<size_t, size_t> qualityMap = {
{0, POPULATION_QUALITY_INVALID},
{400, POPULATION_QUALITY_INVALID},
{8000, POPULATION_QUALITY_INVALID},
{2250000, POPULATION_QUALITY_INVALID},
{2000, POPULATION_QUALITY_APPROXIMATED},
{4000, POPULATION_QUALITY_APPROXIMATED},
{30000, POPULATION_QUALITY_APPROXIMATED},
{40000, POPULATION_QUALITY_APPROXIMATED},
{60000, POPULATION_QUALITY_APPROXIMATED},
{200000, POPULATION_QUALITY_APPROXIMATED},
{700000, POPULATION_QUALITY_APPROXIMATED},
};
if (qualityMap.count(populationValue)) {
return qualityMap[populationValue];
}
if (populationValue % 100 == 0) {
return POPULATION_QUALITY_DOUBTFUL;
}
return POPULATION_QUALITY_GOOD;
}
/// Converts one input ad element into the CityDescription form used for matching.
/// The element's single population figure fills both population slots of the
/// stats, the third stats slot is 0, and the quality is derived from the
/// population via adPopulationQuality.
CityDescription processInputElement(const InputAdExtraElement& element)
{
    const size_t quality = adPopulationQuality(element.population);
    CityDescription description{
        element.adId,
        element.isocode,
        element.point,
        element.area,
        Names{element.name, element.nameLo},
        AdCityStats{element.population, element.population, 0, quality}};
    return description;
}
void filterByIsocodes(std::vector<CityDescription>& cities, const std::set<std::string>& isocodes)
{
cities.erase(
std::remove_if(
cities.begin(),
cities.end(),
[&isocodes](const CityDescription& city) { return !isocodes.count(city.isocode); }),
cities.end());
}
std::set<std::string> makeIsocodes(const std::vector<CityDescription>& cities)
{
std::set<std::string> result;
for (const auto& city: cities) {
result.insert(city.isocode);
}
return result;
}
// A manual population override for a single ad.
// Consumed by applyPopulationExceptions, which applies it only when the
// isocode is among the processed isocodes and the ad already has properties.
struct PopulationException {
// Isocode that must be present in the processed data for the override to apply.
std::string isocode;
// Id of the ad whose stats are overridden.
int64_t adId;
// Value written to stats.population.
size_t population;
// Value written to stats.populationMetro.
size_t populationMetro;
};
// Hand-maintained population overrides, applied by applyPopulationExceptions
// after matching. Each row is {isocode, adId, population, populationMetro};
// the trailing comment names the place the ad id refers to.
const std::vector<PopulationException> POPULATION_EXCEPTIONS = {
{"UA", 1444484334, 47500, 47500}, // Pripyat
{"UA", 1444484320, 10800, 10800}, // Chernobyl
{"RU", 53117933, 623736, 1088177}, // Irkutsk
{"RU", 53119697, 7818, 10001}, // Magas
{"IN", 1553193199, 16787941, 26454000}, // Delhi
{"IN", 2021813555, 16787941, 26454000}, // New Delhi
// MAPSRENDER-2761
{"UZ", 771284585, 441700, 703517}, // Andijon shahri
{"UZ", 1508545260, 4549, 4549}, // Andijon shaharchasi
};
// MAPSRENDER-1805: ad with level_kind=7 (BLOCK) and disp_class=4 (PLUS1)
// Population (and metro population) that applyPopulationExceptions assigns to
// such ads when their population is 0.
constexpr size_t FORMER_CITY_POPULATION = 1000;
void applyPopulationExceptions(
const std::set<std::string>& isocodes,
const InputAdExtraElements& inputElements,
std::unordered_map<int64_t, AdExtraProperties>& adExtraProperties)
{
for (const auto& ad: POPULATION_EXCEPTIONS) {
if (adExtraProperties.count(ad.adId) && isocodes.count(ad.isocode)) {
adExtraProperties[ad.adId].stats.population = ad.population;
adExtraProperties[ad.adId].stats.populationMetro = ad.populationMetro;
}
}
for (const auto& ad: inputElements) {
if (ad.levelKind == ymapsdf::ad::LevelKind::Block &&
ad.dispClass == ymapsdf::DisplayClass::Plus1) {
auto& adStats = adExtraProperties[ad.adId].stats;
if (adStats.population == 0) {
adStats.population = FORMER_CITY_POPULATION;
adStats.populationMetro = FORMER_CITY_POPULATION;
}
}
}
}
// In findMatches we assumed that there are no duplicates in our data
// (or we should select only one city in such cases).
// That's not true for disputed cities, so we fix them here.
// ex. Tskhinvali, Sukhumi
void fixDisputedCities(std::vector<MatchResult>& matches, const InputAdExtraElements& inputElements)
{
std::set<int64_t> disputedIds;
for (const auto& element: inputElements) {
if (element.disputed) {
disputedIds.insert(element.adId);
}
}
using MatchResultPtr = MatchResult*;
std::vector<MatchResultPtr> disputedSources;
std::vector<MatchResultPtr> disputedDestinations;
for (auto& match: matches) {
if (disputedIds.count(match.our->id)) {
if (match.their) {
disputedSources.push_back(&match);
} else {
disputedDestinations.push_back(&match);
}
}
}
DEBUG() << "disputed sources: " << disputedSources.size();
DEBUG() << "disputed destinations: " << disputedDestinations.size();
for (auto& destination: disputedDestinations) {
MatchResultPtr best = nullptr;
double bestDistance = std::numeric_limits<double>::infinity();
for (auto& source: disputedSources) {
double dist = destination->our->point.distance(source->our->point);
if (!best || dist < bestDistance) {
best = source;
bestDistance = dist;
}
}
if (best && distance(destination->our->names, best->our->names) <= 1) {
destination->their = best->their;
destination->properties = best->properties;
}
}
}
void writeDump(const std::vector<CityDescription>& elements, const std::string& fileName)
{
std::ofstream file{fileName};
csv::OutputStream writer{file};
for (const auto& element: elements) {
// clang-format off
writer << element.id
<< element.names.nameEn
<< element.names.nameRu
<< element.names.nameLo
<< element.stats.population
<< element.stats.populationMetro
<< element.stats.wikiVisits
<< element.point.x
<< element.point.y;
// clang-format on
writer.endLine();
}
}
void writeMatches(const std::vector<MatchResult>& matches, const std::string& fileName)
{
std::ofstream file{fileName};
csv::OutputStream writer{file};
for (const auto& match: matches) {
if (!match.their) {
continue;
}
// clang-format off
writer << match.their->id
<< match.our->id
<< match.their->names.nameEn
<< match.our->names.nameEn
<< match.their->names.nameRu
<< match.our->names.nameRu
<< match.their->names.nameLo
<< match.our->names.nameLo
<< match.properties.matchNameDistance
<< match.properties.stats.population
<< match.properties.stats.populationMetro
<< match.properties.stats.populationQuality
<< match.properties.stats.wikiVisits
<< match.our->stats.population
<< match.our->stats.populationQuality
<< match.their->point.x
<< match.their->point.y
<< match.our->point.x
<< match.our->point.y;
// clang-format on
writer.endLine();
}
}
} // namespace
std::unordered_map<int64_t, AdExtraProperties> matchExtraProperties(
const InputAdExtraElements& inputElements)
{
INFO() << "loading ad elements";
std::vector<CityDescription> adElements;
std::transform(
inputElements.begin(),
inputElements.end(),
std::back_inserter(adElements),
processInputElement);
INFO() << "parsing wiki cities";
auto wikiCities = loadWikiCities();
auto isocodes = makeIsocodes(adElements);
filterByIsocodes(wikiCities, isocodes);
INFO() << "matching cities";
auto matches = findMatches(wikiCities, adElements);
fixDisputedCities(matches, inputElements);
applyPopulationThreshold(matches);
if (std::getenv("ENABLE_DEBUG_DUMP")) {
writeDump(adElements, "ad_cities.csv");
writeDump(wikiCities, "wiki_cities.csv");
writeMatches(matches, "matches.csv");
}
std::unordered_map<int64_t, AdExtraProperties> result;
for (const auto& match: matches) {
result[match.our->id] = match.properties;
}
applyPopulationExceptions(isocodes, inputElements, result);
return result;
}
} // namespace maps::renderer::denormalization
Import status: Running file import (still 2 documents to extract)
Because tasks such as text extraction and analysis are still running or pending, not all results may have been found yet: at the time of this search, 2 file(s) could only be searched, overviewed and filtered by their file names, not yet by their content and/or content-based facets/filters.
You can prioritize the import(s) of (a) not yet processed file(s) by click on "Prioritize import" in the list view.
2022-02-24T04:00:00Z
ad_match.cpp
Hashtags
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
- #include
Filename extension
cpp
etl_file_b:
1
1
etl_enhance_mapping_id_time_millis_i:
0
0
etl_enhance_mapping_id_b:
1
1
etl_filter_blacklist_time_millis_i:
0
0
etl_filter_blacklist_b:
1
1
etl_filter_file_not_modified_time_millis_i:
4
4
etl_filter_file_not_modified_b:
1
1
etl_enhance_extract_text_tika_server_ocr_enabled_b:
1
1
etl_count_images_yet_no_ocr_i:
0
0
etl_enhance_ocr_descew_b:
1
1
etl_enhance_pdf_ocr_b:
1
1
etl_enhance_extract_text_tika_server_time_millis_i:
4
4
etl_enhance_extract_text_tika_server_b:
1
1
etl_enhance_detect_language_tika_server_time_millis_i:
2
2
etl_enhance_detect_language_tika_server_b:
1
1
etl_enhance_contenttype_group_time_millis_i:
0
0
etl_enhance_contenttype_group_b:
1
1
etl_enhance_pst_time_millis_i:
0
0
etl_enhance_pst_b:
1
1
etl_enhance_csv_time_millis_i:
0
0
etl_enhance_csv_b:
1
1
etl_enhance_file_mtime_time_millis_i:
0
0
etl_enhance_file_mtime_b:
1
1
etl_enhance_path_time_millis_i:
0
0
etl_enhance_path_b:
1
1
etl_enhance_extract_hashtags_time_millis_i:
0
0
etl_enhance_extract_hashtags_b:
1
1
etl_enhance_warc_time_millis_i:
0
0
etl_enhance_warc_b:
1
1
etl_enhance_zip_time_millis_i:
0
0
etl_enhance_zip_b:
1
1
etl_clean_title_time_millis_i:
0
0
etl_clean_title_b:
1
1
etl_enhance_annotations_time_millis_i:
3
3
etl_enhance_annotations_b:
1
1
etl_enhance_multilingual_time_millis_i:
0
0
etl_enhance_multilingual_b:
1
1
etl_enhance_regex_time_millis_i:
1
1
etl_enhance_regex_b:
1
1
etl_time_millis_i:
18
18
X-TIKA_Parsed-By:
- org.apache.tika.parser.CompositeParser
- org.apache.tika.parser.DefaultParser
- org.apache.tika.parser.csv.TextAndCSVParser
X-TIKA_Parsed-By-Full-Set:
- org.apache.tika.parser.CompositeParser
- org.apache.tika.parser.DefaultParser
- org.apache.tika.parser.csv.TextAndCSVParser
X-TIKA_content_handler:
ToTextContentHandler
ToTextContentHandler
X-TIKA_parse_time_millis:
1
1
X-TIKA_embedded_depth:
0
0
etl_enhance_pdf_ocr_time_millis_i:
0
0