Skip to content

Commit

Permalink
修复一个队列重复的问题
Browse files Browse the repository at this point in the history
开始实现完全分布式框架
  • Loading branch information
邹嵩 committed Jul 25, 2018
1 parent 54b82f5 commit 254aaf5
Show file tree
Hide file tree
Showing 26 changed files with 246 additions and 18 deletions.
12 changes: 12 additions & 0 deletions DotnetSpider.sln
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
runtests.sh = runtests.sh
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Node", "src\DotnetSpider.Node\DotnetSpider.Node.csproj", "{C2BAD1A6-6744-4927-B014-67647D3FAD58}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DotnetSpider.Broker", "src\DotnetSpider.Broker\DotnetSpider.Broker.csproj", "{93099A1A-128B-4023-9271-F535A11F2490}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -98,6 +102,14 @@ Global
{372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Debug|Any CPU.Build.0 = Debug|Any CPU
{372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Release|Any CPU.ActiveCfg = Release|Any CPU
{372C7A6F-E1EB-4AA6-8B31-1AE52FBDAA83}.Release|Any CPU.Build.0 = Release|Any CPU
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C2BAD1A6-6744-4927-B014-67647D3FAD58}.Release|Any CPU.Build.0 = Release|Any CPU
{93099A1A-128B-4023-9271-F535A11F2490}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{93099A1A-128B-4023-9271-F535A11F2490}.Debug|Any CPU.Build.0 = Debug|Any CPU
{93099A1A-128B-4023-9271-F535A11F2490}.Release|Any CPU.ActiveCfg = Release|Any CPU
{93099A1A-128B-4023-9271-F535A11F2490}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down
12 changes: 12 additions & 0 deletions src/DotnetSpider.Broker/DotnetSpider.Broker.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<!-- Project file for DotnetSpider.Broker: an SDK-style project built as an executable. -->
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- Exe: produce a runnable application rather than a class library. -->
<OutputType>Exe</OutputType>
<!-- Single target framework: .NET Core 2.1. -->
<TargetFramework>netcoreapp2.1</TargetFramework>
</PropertyGroup>

<ItemGroup>
<!-- Confluent.Kafka package, pinned to 0.11.5. -->
<PackageReference Include="Confluent.Kafka" Version="0.11.5" />
</ItemGroup>

</Project>
30 changes: 30 additions & 0 deletions src/DotnetSpider.Broker/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
using Confluent.Kafka;
using Confluent.Kafka.Serialization;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Text;

namespace DotnetSpider.Broker
{
    /// <summary>
    /// Console entry point that publishes a fixed number of test messages to a Kafka topic
    /// and prints the delivery result of each one.
    /// </summary>
    class Program
    {
        /// <summary>Broker address used when no command-line argument is supplied.</summary>
        private const string DefaultBootstrapServers = "192.168.90.106:9092";

        /// <summary>Topic the test messages are published to.</summary>
        private const string Topic = "my-topic";

        /// <summary>Number of test messages to publish.</summary>
        private const int MessageCount = 1000;

        /// <summary>
        /// Publishes <see cref="MessageCount"/> messages, one at a time, and reports each delivery.
        /// </summary>
        /// <param name="args">
        /// Optional. When the first argument is a non-blank string it is used as the
        /// <c>bootstrap.servers</c> value; otherwise <see cref="DefaultBootstrapServers"/> is used.
        /// </param>
        static void Main(string[] args)
        {
            var bootstrapServers = args != null && args.Length > 0 && !string.IsNullOrWhiteSpace(args[0])
                ? args[0]
                : DefaultBootstrapServers;

            var config = new Dictionary<string, object>
            {
                { "bootstrap.servers", bootstrapServers }
            };

            using (var producer = new Producer<Null, string>(config, null, new StringSerializer(Encoding.UTF8)))
            {
                for (int i = 0; i < MessageCount; ++i)
                {
                    // Blocking on the task keeps the sends strictly sequential; Main is synchronous here.
                    var dr = producer.ProduceAsync(Topic, null, "test message text").Result;

                    // The delivery report carries the outcome; do not claim success when it holds an error.
                    if (dr.Error.HasError)
                    {
                        Console.Error.WriteLine($"Failed to deliver message {i}: {dr.Error}");
                        continue;
                    }

                    Console.WriteLine($"Delivered '{dr.Value}' to: {dr.TopicPartitionOffset}");
                }
            }

            // Wait for input before exiting so the output remains on screen.
            Console.Read();
        }
    }
}
2 changes: 1 addition & 1 deletion src/DotnetSpider.Core/DotnetSpider.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<TargetFrameworks>net40;net45;netstandard2.0</TargetFrameworks>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
<Version>3.0.1</Version>
<Version>3.0.2</Version>
<Authors>[email protected];</Authors>
<AssemblyName>DotnetSpider.Core</AssemblyName>
<Copyright>Copyright 2018 Lewis Zou</Copyright>
Expand Down
2 changes: 1 addition & 1 deletion src/DotnetSpider.Core/Pipeline/BasePipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ namespace DotnetSpider.Core.Pipeline
/// <summary>
/// 数据管道抽象, 通过数据管道把解析的数据存到不同的存储中(文件、数据库)
/// </summary>
public abstract class BasePipeline : IPipeline
public abstract class BasePipeline : Named, IPipeline
{
/// <summary>
/// 处理页面解析器解析到的数据结果
Expand Down
2 changes: 1 addition & 1 deletion src/DotnetSpider.Core/Spider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ protected void VerifyDataOrGenerateReport(string[] arguments)
public Site Site
{
get => _site;
protected set { _site = value ?? throw new ArgumentException($"{nameof(Site)} should not be null."); }
set { _site = value ?? throw new ArgumentException($"{nameof(Site)} should not be null."); }
}

/// <summary>
Expand Down
23 changes: 23 additions & 0 deletions src/DotnetSpider.Extension/ConfigurableSpider.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
using DotnetSpider.Core;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace DotnetSpider.Extension
{
    /// <summary>
    /// A <see cref="Spider"/> created from a JSON string.
    /// NOTE(review): presumably the JSON is a spider configuration that will drive set-up
    /// once implemented; this class currently only stores the string.
    /// </summary>
    public class ConfigurableSpider : Spider
    {
        // Raw JSON text handed to the constructor; stored but not yet read by this class.
        private readonly string _json;

        /// <summary>
        /// Creates the spider and keeps the supplied JSON text.
        /// </summary>
        /// <param name="json">JSON text to keep; it is stored as-is without validation.</param>
        public ConfigurableSpider(string json) => _json = json;

        /// <summary>
        /// Initialization hook overridden from the base class; currently performs no work.
        /// </summary>
        /// <param name="arguments">Arguments passed to the hook; unused here.</param>
        protected override void OnInit(params string[] arguments)
        {
        }
    }
}
2 changes: 1 addition & 1 deletion src/DotnetSpider.Extension/DotnetSpider.Extension.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<TargetFrameworks>net40;net45;netstandard2.0</TargetFrameworks>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
<Version>3.0.1</Version>
<Version>3.0.2</Version>
<Authors>[email protected];</Authors>
<AssemblyName>DotnetSpider.Extension</AssemblyName>
<Copyright>Copyright 2018 Lewis Zou</Copyright>
Expand Down
2 changes: 1 addition & 1 deletion src/DotnetSpider.Extension/Pipeline/DbModelPipeline.cs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public abstract class DbModelPipeline : ModelPipeline

public int RetryTimes { get; set; } = 600;

public string ConnectString { get; private set; }
public string ConnectString { get; set; }

/// <summary>
/// 数据库忽略大小写
Expand Down
88 changes: 88 additions & 0 deletions src/DotnetSpider.Extension/spider.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"Model": {
"Selector": {
"Type": "XPath",
"Expression": "//div[@class='yk-pack pack-film']",
"Arguments": null
},
"Take": 0,
"TakeFromHead": true,
"Table": {
"Database": "youku",
"Name": "show",
"Postfix": "Today",
"UpdateColumns": null,
"Indexs": null,
"Uniques": null,
"FullName": "show_2018_07_25"
},
"Fields": [
{
"NotNull": false,
"Option": "None",
"Length": 255,
"Name": "name",
"IgnoreStore": false,
"DataType": "String",
"IsPrimary": false,
"Formatters": null,
"Type": "XPath",
"Expression": ".//img[@class='quic']/@alt",
"Arguments": null
},
{
"NotNull": false,
"Option": "None",
"Length": 255,
"Name": "index",
"IgnoreStore": false,
"DataType": "Int",
"IsPrimary": false,
"Formatters": null,
"Type": "Enviroment",
"Expression": "index",
"Arguments": null
},
{
"NotNull": false,
"Option": "None",
"Length": 255,
"Name": "id",
"IgnoreStore": false,
"DataType": "Int",
"IsPrimary": true,
"Formatters": null,
"Type": "Enviroment",
"Expression": "",
"Arguments": null
}
],
"TargetRequestSelectors": [
{
"XPaths": [ "//ul[@class='yk-pages']" ],
"Patterns": [ "(http|ftp|https):\\/\\/[\\w\\-_]+(\\.[\\w\\-_]+)+([\\w\\-\\.,@?^=%&amp;:/~\\+#]*[\\w\\-\\@?^=%&amp;/~\\+#])?" ]
}
],
"SharedValueSelectors": null
},
"Scheduler": {
"Name": "QueueDuplicateRemovedScheduler"
},
"Downloader": {
"Name": "HttpClientDownloader",
"AllowAutoRedirect": true
},
"Pipeline": {
"Name": "MySqlEntityPipeline",
"ConnectString": "Database='mysql';Data Source=localhost;password=;User ID=root;Port=3306;SslMode=None"
},
"ClearSchedulerAfterCompleted": true,
"StatusFlushInterval": 5000,
"PipelineRetryTimes": 2,
"PipelineCachedSize": 5,
"RedialExecutor": "MutexRedialExecutor",
"EmptySleepTime": 15000,
"ExitWhenComplete": true,
"ThreadNum": 1,
"SkipTargetRequestsWhenResultIsEmpty": true
}
2 changes: 1 addition & 1 deletion src/DotnetSpider.Extraction/DotnetSpider.Extraction.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<TargetFrameworks>net40;net45;netstandard2.0</TargetFrameworks>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
<PackageRequireLicenseAcceptance>true</PackageRequireLicenseAcceptance>
<Version>3.0.0</Version>
<Version>3.0.2</Version>
<Authors>[email protected];</Authors>
<AssemblyName>DotnetSpider.Extraction</AssemblyName>
<Copyright>Copyright 2018 Lewis Zou</Copyright>
Expand Down
3 changes: 2 additions & 1 deletion src/DotnetSpider.Extraction/Model/Attribute/FieldSelector.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Newtonsoft.Json;
using System;

namespace DotnetSpider.Extraction.Model.Attribute
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Newtonsoft.Json;
using System;

namespace DotnetSpider.Extraction.Model.Attribute
{
Expand Down
6 changes: 5 additions & 1 deletion src/DotnetSpider.Extraction/Model/Attribute/TableInfo.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Newtonsoft.Json;
using System;

namespace DotnetSpider.Extraction.Model.Attribute
{
Expand All @@ -10,6 +11,9 @@ public class TableInfo : System.Attribute
{
private string _name;

[JsonIgnore]
public override object TypeId => base.TypeId;

/// <summary>
/// 数据库名
/// </summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Newtonsoft.Json;
using System;

namespace DotnetSpider.Extraction.Model.Attribute
{
Expand All @@ -8,6 +9,9 @@ namespace DotnetSpider.Extraction.Model.Attribute
[AttributeUsage(AttributeTargets.Class, AllowMultiple = true)]
public class TargetRequestSelector : System.Attribute
{
[JsonIgnore]
public override object TypeId => base.TypeId;

public TargetRequestSelector() { }

public TargetRequestSelector(string[] xpaths, string[] patterns = null)
Expand Down
6 changes: 5 additions & 1 deletion src/DotnetSpider.Extraction/Model/Attribute/ToNext.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using Newtonsoft.Json;
using System;

namespace DotnetSpider.Extraction.Model.Attribute
{
Expand All @@ -8,6 +9,9 @@ namespace DotnetSpider.Extraction.Model.Attribute
[AttributeUsage(AttributeTargets.Property, AllowMultiple = true)]
public class ToNext : System.Attribute
{
[JsonIgnore]
public override object TypeId => base.TypeId;

/// <summary>
/// 保存到起始链接的额外信息
/// </summary>
Expand Down
5 changes: 4 additions & 1 deletion src/DotnetSpider.Extraction/Model/DataType.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
using System;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace DotnetSpider.Extraction.Model
{
[JsonConverter(typeof(StringEnumConverter))]
public enum DataType
{
None,
Expand Down
5 changes: 4 additions & 1 deletion src/DotnetSpider.Extraction/Model/FieldOptions.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
using System;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
Expand All @@ -8,6 +10,7 @@ namespace DotnetSpider.Extraction.Model
/// <summary>
/// 额外选项的定义
/// </summary>
[JsonConverter(typeof(StringEnumConverter))]
public enum FieldOptions
{
/// <summary>
Expand Down
2 changes: 2 additions & 0 deletions src/DotnetSpider.Extraction/Model/ModelDefinition.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using DotnetSpider.Extraction.Model.Attribute;
using Newtonsoft.Json;
using System;
using System.Collections.Generic;
using System.Linq;
Expand Down Expand Up @@ -43,6 +44,7 @@ public class ModelDefinition : IModel
/// </summary>
public IEnumerable<SharedValueSelector> SharedValueSelectors { get; protected set; }

[JsonIgnore]
public string Identity { get; protected set; }

public ModelDefinition(Selector selector, IEnumerable<FieldSelector> fields, TableInfo table,
Expand Down
8 changes: 7 additions & 1 deletion src/DotnetSpider.Extraction/Model/Selector.cs
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
namespace DotnetSpider.Extraction.Model
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;

namespace DotnetSpider.Extraction.Model
{
/// <summary>
/// 选择器特性
/// </summary>
public class Selector : System.Attribute
{
[JsonIgnore]
public override object TypeId => base.TypeId;

/// <summary>
/// 构造方法
/// </summary>
Expand Down
6 changes: 5 additions & 1 deletion src/DotnetSpider.Extraction/Model/TableNamePostfix.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
namespace DotnetSpider.Extraction.Model
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;

namespace DotnetSpider.Extraction.Model
{
[JsonConverter(typeof(StringEnumConverter))]
public enum TableNamePostfix
{
None,
Expand Down
5 changes: 4 additions & 1 deletion src/DotnetSpider.Extraction/SelectorType.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
using System;
using Newtonsoft.Json;
using Newtonsoft.Json.Converters;
using System;

namespace DotnetSpider.Extraction
{
/// <summary>
/// 查询器类型
/// </summary>
[Flags]
[JsonConverter(typeof(StringEnumConverter))]
public enum SelectorType
{
/// <summary>
Expand Down
Loading

0 comments on commit 254aaf5

Please sign in to comment.